diff --git "a/code/pdb_featureVector.py" "b/code/pdb_featureVector.py" --- "a/code/pdb_featureVector.py" +++ "b/code/pdb_featureVector.py" @@ -25,16 +25,13 @@ from Bio.PDB import PDBList from Bio import Align from Bio import SeqIO from Bio.PDB import * -import streamlit as st -from urllib.error import HTTPError -import Bio - warnings.filterwarnings("ignore") start = timer() # FUNCTIONS + # FUNCTIONS from calc_pc_property import * from add_domains import * @@ -52,6 +49,7 @@ from uniprotSequenceMatch import uniprotSequenceMatch from process_input import clean_data + def pdb(input_set, mode, impute): aligner = Align.PairwiseAligner() """ @@ -60,1166 +58,1046 @@ def pdb(input_set, mode, impute): Add datapoint identifier and remove non-standard input. """ data = clean_data(input_set) - path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files( - mode) + path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode) + out_path = path_to_output_files / 'log.txt' + sys.stdout = open(out_path, 'w') print('Creating directories...') annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand', - 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', - 'region', + 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region', 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide', 'transitPeptide', 'glycosylation', 'propeptide'] print('Feature vector generation started...\n') - cont = True - try: - if cont == False: - print('Feature vectore generation terminated.') - else: - """ - STEP 2 - Add physicochemical properties. - """ - print('Adding physicochemical properties...\n') - - data = add_physicochemical(data) - - """ - STEP 3 - Add domain-related information. - """ - print('Adding domains\n') - - data = add_domains(data, path_to_domains) - data = data.astype(str) - data = data.replace({'NaN': 'nan'}) - data.domain = data.domain.replace({'nan': '-1'}) - data.domStart = data.domStart.replace({'nan': '-1'}) - data.domEnd = data.domEnd.replace({'nan': '-1'}) - data.distance = data.distance.replace({'nan': '-1'}) - - """ - STEP 4 - Retrieve canonical and isoform UniProt sequences. - Add to the data frame. 
- """ - print('Retrieving UniProt sequences...\n') - - canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence']) - up_list = list(set(data['uniprotID'].to_list())) - for i in range(len(up_list)): - canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i]) - canonical_fasta.at[i, 'uniprotID'] = up_list[i] - canonical_fasta = canonical_fasta.drop_duplicates() - isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence']) - iso_dict = [] - for i in range(len(up_list)): - iso_dict.append(get_isoforms(up_list[i])) - - index = 0 - for i in iso_dict: - for key, val in i.items(): - isoform_fasta.at[index, 'uniprotID'] = key - isoform_fasta.at[index, 'isoformSequence'] = val - index += 1 - isoform_fasta = isoform_fasta.drop_duplicates() - for i in isoform_fasta.index: - isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip() - isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6] - print('Sequence files created...\n') - - data = data.merge(canonical_fasta, on='uniprotID', how='left') - data = data.astype(str) - data['whichIsoform'] = 'nan' - data.replace({'': 'nan'}, inplace=True) - data['wt_sequence_match'] = '' - for i in data.index: - if len(data.at[i, 'uniprotSequence']) >= int(data.at[i, 'pos']): - wt = data.at[i, 'wt'] - can = str(data.at[i, 'uniprotSequence'])[int(data.at[i, 'pos']) - 1] - if wt == can: - data.at[i, 'wt_sequence_match'] = 'm' - elif wt != can: - isoList = isoform_fasta[ - isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() - for k in isoList: - if len(k) >= int(data.at[i, 'pos']): - resInIso = k[int(int(data.at[i, 'pos']) - 1)] - if wt == resInIso: - whichIsoform = \ - isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0] - data.at[i, 'wt_sequence_match'] = 'i' - data.at[i, 'whichIsoform'] = whichIsoform - break - - elif len(data.at[i, 'uniprotSequence']) < int(data.at[i, 'pos']): - isoList = isoform_fasta[ - isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() + if len(data) == 0: + print('Feature vectore generation terminated.') + else: + """ + STEP 2 + Add physicochemical properties. + """ + print('Adding physicochemical properties...\n') + + data = add_physicochemical(data) + + """ + STEP 3 + Add domain-related information. + """ + print('Adding domains\n') + + data = add_domains(data, path_to_domains) + + data = data.astype(str) + data = data.replace({'NaN': 'nan'}) + data.domain = data.domain.replace({'nan': '-1'}) + data.domStart = data.domStart.replace({'nan': '-1'}) + data.domEnd = data.domEnd.replace({'nan': '-1'}) + data.distance = data.distance.replace({'nan': '-1'}) + + """ + STEP 4 + Retrieve canonical and isoform UniProt sequences. + Add to the data frame. 
+ """ + print('Retrieving UniProt sequences...\n') + + canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence']) + up_list = list(set(data['uniprotID'].to_list())) + for i in range(len(up_list)): + canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i]) + canonical_fasta.at[i, 'uniprotID'] = up_list[i] + + canonical_fasta = canonical_fasta.drop_duplicates() + isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence']) + iso_dict = [] + for i in range(len(up_list)): + iso_dict.append(get_isoforms(up_list[i])) + + index = 0 + for i in iso_dict: + for key, val in i.items(): + isoform_fasta.at[index, 'uniprotID'] = key + isoform_fasta.at[index, 'isoformSequence'] = val + index += 1 + isoform_fasta = isoform_fasta.drop_duplicates() + + for i in isoform_fasta.index: + isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip() + isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6] + print('Sequence files created...\n') + + data = data.merge(canonical_fasta, on='uniprotID', how='left') + data = data.astype(str) + data['whichIsoform'] = 'nan' + data.replace({'': 'nan'}, inplace=True) + data['wt_sequence_match'] = '' + for i in data.index: + if len(data.at[i, 'uniprotSequence']) >= int(data.at[i, 'pos']): + wt = data.at[i, 'wt'] + can = str(data.at[i, 'uniprotSequence'])[int(data.at[i, 'pos']) - 1] + if wt == can: + data.at[i, 'wt_sequence_match'] = 'm' + elif wt != can: + isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() for k in isoList: if len(k) >= int(data.at[i, 'pos']): resInIso = k[int(int(data.at[i, 'pos']) - 1)] - wt = data.at[i, 'wt'] if wt == resInIso: - whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[ - 0] + whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0] data.at[i, 'wt_sequence_match'] = 'i' data.at[i, 'whichIsoform'] = whichIsoform break - data.wt_sequence_match = data.wt_sequence_match.astype('str') - data.replace({'': 'nan'}, inplace=True) - data_size = len(data.drop_duplicates(['datapoint'])) - not_match_in_uniprot = data[(data.uniprotSequence == 'nan') | (data.wt_sequence_match == 'nan')] - uniprot_matched = data[(data.uniprotSequence != 'nan') & (data.wt_sequence_match != 'nan')] - data = None - - print( - 'You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n' - % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])), - len(uniprot_matched.drop_duplicates(['datapoint'])))) - - """ - STEP 5 - Retrieve related PDB sequences, extract their sequences. - Add to the data frame. 
- """ - - pdb_fasta = pd.DataFrame(columns=['pdbID', 'chain', 'pdbSequence']) - pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution']) - - print('Retrieving PDB structures...\n') - pdbs = [] - protein = uniprot_matched.uniprotID.to_list() - protein = list(set(protein)) - - - for prot in protein: - pdbs.append(get_pdb_ids(prot)) - st.write(pdbs) + elif len(data.at[i, 'uniprotSequence']) < int(data.at[i, 'pos']): + isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() + for k in isoList: + if len(k) >= int(data.at[i, 'pos']): + resInIso = k[int(int(data.at[i, 'pos']) - 1)] + wt = data.at[i, 'wt'] + if wt == resInIso: + whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0] + data.at[i, 'wt_sequence_match'] = 'i' + data.at[i, 'whichIsoform'] = whichIsoform + break + + data.wt_sequence_match = data.wt_sequence_match.astype('str') + data.replace({'': 'nan'}, inplace=True) + data_size = len(data.drop_duplicates(['datapoint'])) + not_match_in_uniprot = data[(data.uniprotSequence == 'nan') | (data.wt_sequence_match == 'nan')] + uniprot_matched = data[(data.uniprotSequence != 'nan') & (data.wt_sequence_match != 'nan')] + data = None + + print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n' + % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])), + len(uniprot_matched.drop_duplicates(['datapoint'])))) + + """ + STEP 5 + Retrieve related PDB sequences, extract their sequences. + Add to the data frame. + """ + from urllib.error import HTTPError + pdb_fasta = pd.DataFrame(columns=['pdbID', 'chain', 'pdbSequence']) + pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution']) + + print('Retrieving PDB structures...\n') + pdbs = [] + protein = uniprot_matched.uniprotID.to_list() + protein = list(set(protein)) + + for prot in protein: + pdbs.append(get_pdb_ids(prot)) + print('PDBs', pdbs) + if len(pdbs)>=1: + print('pdbs not empty') pdbs = [item for sublist in pdbs for item in sublist] - print('Processing PDB structures...\n') - if pdbs == []: - print('No PDB structure found for the query. ') - - print('Starting PDB structures download...\n') - pdbs = list(filter(None, pdbs)) - pdbs = (set(pdbs)) - pdbs = [i.lower() for i in pdbs] - pdbl = PDBList() - parser = PDBParser() - index = 0 + print('NEW', pdbs) + else: + print('pdbs empty') + pdbs =[] + print('Processing PDB structures...\n') + if pdbs == []: + print('No PDB structure found for the query. ') + """ + try: + pdbs = [j.strip('[').strip(']').strip().strip('\'').strip('\"') for j in + ((',').join([str(item) for item in pdbs])).split(',')] + except IndexError: + pdbs = [] + print('No PDB structure found for the query. 
')
+            """
+            print('Starting PDB structures download...\n')
+            pdbs = list(filter(None, pdbs))
+            pdbs = set(pdbs)
+            pdbs = [i.lower() for i in pdbs]
+            pdbl = PDBList()
+            parser = PDBParser()
+            index = 0
+
+            try:
+                shutil.rmtree('obsolete')
+            except OSError:
+                pass
+            pdb_structures_path = path_to_output_files / 'pdb_structures'
+            existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
+            existing_pdb = [str(i) for i in existing_pdb]
+            existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
+            cnt = 0
+            for search in pdbs:
+                try:
+                    if search.lower() not in existing_pdb:
+                        file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
+                    else:
+                        print('PDB structure file exists..')
+                    for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
+                        filename_replace_ext = filename.with_suffix(".pdb")
+                        filename.rename(filename_replace_ext)
+
+                    file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
+
+                    base = os.path.splitext(str(file))[0]
+                    base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
+                    os.rename(file, base + ".ent")
+                    file = base + '.ent'
+
+                    resolution_method = parser.get_structure(search, file)
+                    for record in SeqIO.parse(file, "pdb-seqres"):
+                        if record.dbxrefs[0].split(':')[0] == 'UNP':
+                            pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
+                            pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
+                            pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
+                            pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1]
+                            pdb_info.at[index, 'pdbID'] = record.id.split(':')[0]
+                            pdb_info.at[index, 'chain'] = record.annotations["chain"]
+                            pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
+                            index += 1
+                except Exception:
+                    # Download or parsing failed; record placeholders for this entry.
+                    pdb_info.at[index, 'uniprotID'] = 'nan'
+                    pdb_info.at[index, 'pdbID'] = 'nan'
+                    pdb_info.at[index, 'chain'] = 'nan'
+                    pdb_info.at[index, 'resolution'] = 'nan'
+                cnt += 1
+            print()
+            print('PDB file processing finished..')
+            for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
+                try:
+                    filename_replace_ext = filename.with_suffix(".pdb")
+                    filename.rename(filename_replace_ext)
+                except FileNotFoundError:
+                    pass
             for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
                 try:
-                    shutil.rmtree('obsolete')
-                except OSError as e:
-                    pass
+                    if filename.stem.startswith("pdb"):
+                        filename_replace_ext = filename.with_name(filename.stem[3:])
+                        filename.rename(filename_replace_ext.with_suffix('.pdb'))
+                except FileNotFoundError:
+                    pass
+
+            uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
+            uniprot_matched = uniprot_matched.astype(str)
+            uniprot_matched = uniprot_matched.drop_duplicates()
+
+            uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
+            uniprot_matched = uniprot_matched.astype(str)
+
+            with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
+                    (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
+                    uniprot_matched.resolution != 'None'))].drop_duplicates()
+            no_pdb = uniprot_matched[(uniprot_matched.pdbID == 'nan') | (
+                    (uniprot_matched.resolution == 'nan') | (uniprot_matched.resolution == 'OT') | (
+                    uniprot_matched.resolution == 'None'))]
+            no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
+            no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
+
+            print(
+                'PDB Information successfully added...\nPDB structures are found for %d of 
%d.\n%d of %d failed to match with PDB structure.\n' + % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])), + len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])))) + + with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + with_pdb.replace({'': 'nan'}, inplace=True) + + if len(with_pdb) == 0: + with_pdb['pdbInfo'] = '' + else: + for i in with_pdb.index: + try: + res = str(with_pdb.at[i, 'resolution']) + chain = with_pdb.at[i, 'chain'] + new = with_pdb.at[i, 'pdbID'] + ':' + chain + ':' + res + with_pdb.at[i, 'pdbInfo'] = new + except: + TypeError + with_pdb.at[i, 'pdbInfo'] = 'nan' - existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*")) - existing_pdb = [str(i) for i in existing_pdb] - existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb] + with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence', + 'wt_sequence_match', + 'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']] - cnt = 0 - for search in pdbs: - st.write('pdb',pdb) - try: - if search.lower() not in existing_pdb: - - # Specify the URL of the PDB file you want to download - pdb_url = f"https://files.rcsb.org/download/{search}.pdb" - # Set the path within your Hugging Face space where you want to store the PDB files - pdb_folder_path = Path(path_to_output_files / 'pdb_structures') - st.write(pdb_folder_path) - # Extract the PDB filename from the URL - pdb_filename = pdb_url.split("/")[-1] - - # Set the path for the downloaded file - pdb_file_path = os.path.join(pdb_folder_path, pdb_filename) - - # Send a GET request to download the PDB file - response = requests.get(pdb_url) - if response.status_code == 200: - # Save the file to the specified path - with open(pdb_file_path, "wb") as file: - file.write(response.content) - print("PDB file downloaded successfully!") - else: - print("Failed to download the PDB file.") - - else: - print('PDB structure file exists..') - for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): - filename_replace_ext = filename.with_suffix(".pdb") - filename.rename(filename_replace_ext) - - file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb') - - base = os.path.splitext(str(file))[0] - base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1] - os.rename(file, base + ".ent") - file = base + '.ent' - - # Parse the PDB file - structure = parser.get_structure("structure", file) - # Get the resolution from the Structure object - resolution = structure.header["resolution"] - - for record in SeqIO.parse(file, "pdb-seqres"): - if record.dbxrefs[0].split(':')[0] == 'UNP': - pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0] - pdb_fasta.at[index, 'chain'] = record.id.split(':')[1] - pdb_fasta.at[index, 'pdbSequence'] = str(record.seq) - pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1] - pdb_info.at[index, 'pdbID'] = record.id.split(':')[0] - pdb_info.at[index, 'chain'] = record.annotations["chain"] - pdb_info.at[index, 'resolution'] = resolution - index += 1 - except: - IndexError - pdb_info.at[index, 'uniprotID'] = 'nan' - pdb_info.at[index, 'pdbID'] = 'nan' - pdb_info.at[index, 'chain'] = 'nan' - pdb_info.at[index, 
'resolution'] = 'nan' - index += 1 - cnt += 1 - st.write('pdb_info') - - st.write(pdb_info) - print('PDB file processing finished..') - for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): - try: - filename_replace_ext = filename.with_suffix(".pdb") - filename.rename(filename_replace_ext) - except: - FileNotFoundError - for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): - try: - if filename.stem.startswith("pdb"): - filename_replace_ext = filename.with_name(filename.stem[3:]) - filename.rename(filename_replace_ext.with_suffix('.pdb')) - except: - FileNotFoundError - - uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left') - uniprot_matched = uniprot_matched.astype(str) - uniprot_matched = uniprot_matched.drop_duplicates() - - uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left') - uniprot_matched = uniprot_matched.astype(str) - st.write('uniprot_matched') - st.write(uniprot_matched) - with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & ( - (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & ( - uniprot_matched.resolution != 'None'))].drop_duplicates() - no_pdb = uniprot_matched[(uniprot_matched.pdbID == 'nan') | ( - (uniprot_matched.resolution == 'nan') | (uniprot_matched.resolution == 'OT') | ( - uniprot_matched.resolution == 'None'))] - no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())] - no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True) - st.write('with_pdb') - st.write(with_pdb) - print( - 'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n' - % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])), - len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])))) - - with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - with_pdb.replace({'': 'nan'}, inplace=True) - - if len(with_pdb) == 0: - with_pdb['pdbInfo'] = '' - else: - for i in with_pdb.index: - try: - res = str(with_pdb.at[i, 'resolution']) - chain = with_pdb.at[i, 'chain'] - new = with_pdb.at[i, 'pdbID'] + ':' + chain + ':' + res - with_pdb.at[i, 'pdbInfo'] = new - except: - TypeError - with_pdb.at[i, 'pdbInfo'] = 'nan' - - with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence', - 'wt_sequence_match', - 'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']] - - # If the query data points are found in no_match_in_uniprot data frame, it will not give any results. - # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps. - # If the query data points are found in with_pdb data frame, it will be searched in the following steps. - - """ - STEP 6 - Retrieve sequence annotations. - Add to the data frame. 
- """ - - if len(with_pdb) > 0: - with_pdb = add_annotations(with_pdb) - else: - new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', - 'dnaBinding', - 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', 'mutagenesis', 'strand', - 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', - 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', 'peptide', - 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfideBinary', - 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', - 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary'] - with_pdb = pd.DataFrame(columns=new_cols) - try: - with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str') - except: - AttributeError - with_pdb['whichIsoform'] = '' - - with_pdb = with_pdb.astype(str) - with_pdb = with_pdb.replace({'NaN': 'nan'}) - with_pdb.replace({'[]': 'nan'}, inplace=True) - with_pdb.replace({'nan-nan': 'nan'}, inplace=True) - with_pdb.replace({'': 'nan'}, inplace=True) - - """ - STEP 7 - Do alignment for PDB - """ - # Canonical matches, i.e. labelled as m, canonical sequences will be aligned with PDB sequences. - # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences. - with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C') - with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C') - - dfM = with_pdb[with_pdb.wt_sequence_match == 'm'] - dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - dfM = dfM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - - dfNM = with_pdb[with_pdb.wt_sequence_match == 'i'] - dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True) - - dfM = dfM.astype(str) - dfNM = dfNM.astype(str) - - dfM.reset_index(inplace=True) - dfM.drop(['index'], axis=1, inplace=True) - dfNM.reset_index(inplace=True) - dfNM.drop(['index'], axis=1, inplace=True) - - uniprot_matched_size = len(uniprot_matched.drop_duplicates(['datapoint'])) - uniprot_matched = None - pdb_fasta = None - pdb_info = None - pdbs = None - existing_pdb = None - with_pdb_size = len(with_pdb.drop_duplicates(['datapoint'])) - with_pdb = None + # If the query data points are found in no_match_in_uniprot data frame, it will not give any results. + # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps. + # If the query data points are found in with_pdb data frame, it will be searched in the following steps. - print('Aligning sequences...\n') + """ + STEP 6 + Retrieve sequence annotations. + Add to the data frame. 
+ """ - aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files')) - aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files')) - # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them. - for i in aligned_m.index: - if aligned_m.at[i, 'pdbSequence'] == 'nan': - aligned_m.at[i, 'mutationPositionOnPDB'] = 'nan' - aligned_m.at[i, 'domainStartonPDB'] = 'nan' - aligned_m.at[i, 'domainEndonPDB'] = 'nan' - aligned_m.at[i, 'pdb_alignStatus'] = 'nan' - - for i in aligned_nm.index: - if aligned_nm.at[i, 'pdbSequence'] == 'nan': - aligned_nm.at[i, 'mutationPositionOnPDB'] = 'nan' - aligned_nm.at[i, 'domainStartonPDB'] = 'nan' - aligned_nm.at[i, 'domainEndonPDB'] = 'nan' - aligned_nm.at[i, 'pdb_alignStatus'] = 'nan' - - # Check if they the same column name before merging. - aligned_m = aligned_m.astype(str) - aligned_nm = aligned_nm.astype(str) - - frames = [aligned_m, aligned_nm] - after_up_pdb_alignment = pd.concat(frames, sort=False) - if len(after_up_pdb_alignment) == 0: - after_up_pdb_alignment['pdb_alignStatus'] = '' - after_up_pdb_alignment['mutationPositionOnPDB'] = '' - after_up_pdb_alignment['domainStartonPDB'] = '' - after_up_pdb_alignment['domainEndonPDB'] = '' - - after_up_pdb_alignment = after_up_pdb_alignment.sort_values( - by=['uniprotID', 'wt', 'mut', 'pos', 'pdb_alignStatus', 'resolution', 'chain'], - ascending=[True, True, True, True, True, True, True]) - - after_up_pdb_alignment = after_up_pdb_alignment.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos'], - keep='first') - - after_up_pdb_alignment = after_up_pdb_alignment.astype('str') - - pdb_aligned = after_up_pdb_alignment[ - (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB != 'nan')] - yes_pdb_no_match = after_up_pdb_alignment[ - (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')] - no_pdb = no_pdb.copy() - - print('PDB matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' % len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d not found on the covered area by the structure.' % ( - len(yes_pdb_no_match.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--PDB structures not found for %d datapoints.' % len(no_pdb.drop_duplicates(['datapoint']))) - print('--%d will be searched in Swiss-Model database.\n' % ( - len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint'])))) - - dfM = None - dfNM = None - aligned_nm = None - aligned_m = None - after_up_pdb_alignment = None - - print('Proceeding to SwissModel search...') - print('------------------------------------\n') - - # At this point we have 4 dataframes - # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that wasnt found matching to after the alignment. Will be searched in other databases as well. - # 1a. aligned --- we are done with this. - # 1b. yes_pdb_no_match --- They have PDB structures but not matched, so will be searched in the other databases. - # 2. not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. - # 3. 
no_pdb --- No PDB structures were found for them. Will be searched in other databases. - - """ - Step 8 - Neutralize data points that are to be searched in Swiss-Model - # One point is that yes_pdb_no_match's annotations are the adjusted according to the PDBs they are matched before. - # They need to be converted to their old original UniProt annotation positions. - """ - yes_pdb_no_match.drop(['disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'caBinding', 'topologicalDomain', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', - 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary', 'pdbSequence', 'pdbInfo', 'pdbID', - 'chain', 'resolution', 'pdb_alignStatus', 'mutationPositionOnPDB', - 'domainStartonPDB', 'domainEndonPDB'], axis=1, inplace=True) - - to_swiss = pd.concat( - [yes_pdb_no_match.drop_duplicates(['datapoint']), no_pdb.drop_duplicates(['datapoint'])]) - no_pdb = None - to_swiss.reset_index(inplace=True) - to_swiss.drop(['index'], axis=1, inplace=True) - to_swiss = to_swiss.astype('str') - to_swiss = to_swiss.replace({'NaN': 'nan'}) - # Create model summary dataframe. 
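The bookkeeping described in the comments above (pdb_aligned done, yes_pdb_no_match and no_pdb rolled into to_swiss, not_match_in_uniprot set aside) is meant to partition the input datapoints without overlap or loss. A minimal sanity-check sketch of that invariant; the helper name and the frame list are illustrative, not part of the pipeline:

    def check_datapoint_partition(all_datapoints, frames):
        # frames: DataFrames that together should partition the input datapoints
        seen = set()
        for frame in frames:
            points = set(frame['datapoint'])
            assert not seen & points, 'datapoint assigned to more than one frame'
            seen |= points
        assert seen == set(all_datapoints), 'some datapoints were silently dropped'

    # e.g. check_datapoint_partition(data['datapoint'],
    #                                [pdb_aligned, not_match_in_uniprot, to_swiss])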
- if len(to_swiss) != 0: - # import zipfile - # with zipfile.ZipFile(Path(path_to_input_files / 'swissmodel_structures.txt.zip'),"r") as zip_ref: - # zip_ref.extractall(Path(path_to_input_files)) - - print('Generating SwissModel file...\n') - - swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t', - dtype=str, header=None, skiprows=1, - names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', - 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', - 'qmean_norm', 'seqid', 'url']) + if len(with_pdb) > 0: + with_pdb = add_annotations(with_pdb) + else: + new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', + 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', 'mutagenesis', 'strand', + 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', + 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', 'peptide', + 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', + 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary'] + with_pdb = pd.DataFrame(columns = new_cols) + try: + with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str') + except: + AttributeError + with_pdb['whichIsoform'] = '' + + with_pdb = with_pdb.astype(str) + with_pdb = with_pdb.replace({'NaN': 'nan'}) + with_pdb.replace({'[]': 'nan'}, inplace=True) + with_pdb.replace({'nan-nan': 'nan'}, inplace=True) + with_pdb.replace({'': 'nan'}, inplace=True) + + """ + STEP 7 + Do alignment for PDB + """ + # Canonical matches, i.e. labelled as m, canonical sequences will be aligned with PDB sequences. + # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences. 
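STEP 7 re-indexes each mutation from UniProt numbering onto the PDB SEQRES sequence via pairwise alignment (the pipeline's own version lives in final_stage, called below). A self-contained sketch of that re-indexing with Biopython's PairwiseAligner; map_position is a hypothetical helper, not a pipeline function:

    from Bio import Align

    def map_position(uniprot_seq, pdb_seq, pos):
        """Map the 1-based index pos on uniprot_seq onto pdb_seq, or None if gapped."""
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        best = aligner.align(uniprot_seq, pdb_seq)[0]
        # best.aligned pairs up matched coordinate blocks of the two sequences
        for (u_start, u_end), (p_start, p_end) in zip(*best.aligned):
            if u_start <= pos - 1 < u_end:
                return p_start + (pos - 1 - u_start) + 1
        return None  # the mutated residue is not covered by the structure

    print(map_position('MKTAYIAKQR', 'TAYIAKQR', 4))  # -> 2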
+ with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C') + with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C') + + dfM = with_pdb[with_pdb.wt_sequence_match == 'm'] + dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + dfM = dfM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + + dfNM = with_pdb[with_pdb.wt_sequence_match == 'i'] + dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True) + + dfM = dfM.astype(str) + dfNM = dfNM.astype(str) + + dfM.reset_index(inplace=True) + dfM.drop(['index'], axis=1, inplace=True) + dfNM.reset_index(inplace=True) + dfNM.drop(['index'], axis=1, inplace=True) + + uniprot_matched_size = len(uniprot_matched.drop_duplicates(['datapoint'])) + uniprot_matched = None + pdb_fasta = None + pdb_info = None + pdbs = None + existing_pdb = None + with_pdb_size = len(with_pdb.drop_duplicates(['datapoint'])) + with_pdb = None + + print('Aligning sequences...\n') + aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files')) + aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files')) + + # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them. + for i in aligned_m.index: + if aligned_m.at[i, 'pdbSequence'] == 'nan': + aligned_m.at[i, 'mutationPositionOnPDB'] = 'nan' + aligned_m.at[i, 'domainStartonPDB'] = 'nan' + aligned_m.at[i, 'domainEndonPDB'] = 'nan' + aligned_m.at[i, 'pdb_alignStatus'] = 'nan' + + for i in aligned_nm.index: + if aligned_nm.at[i, 'pdbSequence'] == 'nan': + aligned_nm.at[i, 'mutationPositionOnPDB'] = 'nan' + aligned_nm.at[i, 'domainStartonPDB'] = 'nan' + aligned_nm.at[i, 'domainEndonPDB'] = 'nan' + aligned_nm.at[i, 'pdb_alignStatus'] = 'nan' + + # Check if they the same column name before merging. + aligned_m = aligned_m.astype(str) + aligned_nm = aligned_nm.astype(str) + + + frames = [aligned_m, aligned_nm] + after_up_pdb_alignment = pd.concat(frames, sort=False) + if len(after_up_pdb_alignment) == 0: + after_up_pdb_alignment['pdb_alignStatus'] = '' + after_up_pdb_alignment['mutationPositionOnPDB'] = '' + after_up_pdb_alignment['domainStartonPDB'] = '' + after_up_pdb_alignment['domainEndonPDB'] = '' + + after_up_pdb_alignment = after_up_pdb_alignment.sort_values( + by=['uniprotID', 'wt', 'mut', 'pos', 'pdb_alignStatus', 'resolution', 'chain'], + ascending=[True, True, True, True, True, True, True]) + + after_up_pdb_alignment = after_up_pdb_alignment.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos'], keep='first') + + after_up_pdb_alignment = after_up_pdb_alignment.astype('str') + + pdb_aligned = after_up_pdb_alignment[ + (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB != 'nan')] + yes_pdb_no_match = after_up_pdb_alignment[ + (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')] + no_pdb = no_pdb.copy() + + + print('PDB matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' 
% ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d not found on the covered area by the structure.' % ( + len(yes_pdb_no_match.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--PDB structures not found for %d datapoints.' % len(no_pdb.drop_duplicates(['datapoint']))) + print('--%d will be searched in Swiss-Model database.\n' % ( + len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint'])))) + + + dfM = None + dfNM = None + aligned_nm = None + aligned_m = None + after_up_pdb_alignment = None + + print('Proceeding to SwissModel search...') + print('------------------------------------\n') + + # At this point we have 4 dataframes + # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that wasnt found matching to after the alignment. Will be searched in other databases as well. + # 1a. aligned --- we are done with this. + # 1b. yes_pdb_no_match --- They have PDB structures but not matched, so will be searched in the other databases. + # 2. not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. + # 3. no_pdb --- No PDB structures were found for them. Will be searched in other databases. + + """ + Step 8 + Neutralize data points that are to be searched in Swiss-Model + # One point is that yes_pdb_no_match's annotations are the adjusted according to the PDBs they are matched before. + # They need to be converted to their old original UniProt annotation positions. + """ + yes_pdb_no_match.drop(['disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'caBinding', 'topologicalDomain', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', + 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary', 'pdbSequence', 'pdbInfo', 'pdbID', + 'chain', 'resolution', 'pdb_alignStatus', 'mutationPositionOnPDB', + 'domainStartonPDB', 'domainEndonPDB'], axis=1, inplace=True) + + to_swiss = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), no_pdb.drop_duplicates(['datapoint'])]) + no_pdb = None + to_swiss.reset_index(inplace=True) + to_swiss.drop(['index'], axis=1, inplace=True) + to_swiss = to_swiss.astype('str') + to_swiss = to_swiss.replace({'NaN': 'nan'}) + # Create model summary dataframe. 
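The SWISS-MODEL index read just below is a plain tab-separated table with one row per model. A standalone sketch of the same read-and-rank step, assuming a local copy of swissmodel_structures.txt; the column names are taken from the read_csv call that follows, and qmean_norm stays a string here exactly as in the pipeline:

    import pandas as pd

    cols = ['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
            'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
            'qmean_norm', 'seqid', 'url']
    swiss = pd.read_csv('swissmodel_structures.txt', sep='\t', dtype=str,
                        header=None, skiprows=1, names=cols)
    swiss = swiss[swiss['provider'] == 'SWISSMODEL']
    # Rank so the best model (highest qmean_norm, then seqid) comes first per protein.
    swiss = swiss.sort_values(['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
    best_per_protein = swiss.drop_duplicates('UniProtKB_ac', keep='first')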
+ if len(to_swiss) != 0: + print('Generating SwissModel file...\n') + + swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t', + dtype=str, header=None, skiprows=1, + names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', + 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm','seqid', 'url']) + else: + swiss_model = pd.DataFrame( + columns=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', 'coordinate_id', + 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm', 'seqid', 'url', 'whichIsoform']) + swiss_model = swiss_model.astype('str') + try: + swiss_model.iso_id = swiss_model.iso_id.astype('str') + except: + AttributeError + swiss_model['iso_id'] = 'nan' + swiss_model = swiss_model[swiss_model.UniProtKB_ac != 'nan'] + for ind in swiss_model.index: + swiss_model.at[ind, 'UniProtKB_ac'] = swiss_model.at[ind, 'UniProtKB_ac'].split('-')[0] + if swiss_model.at[ind, 'iso_id'] != 'nan': + + swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1] else: - swiss_model = pd.DataFrame( - columns=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', 'coordinate_id', - 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm', 'seqid', 'url', - 'whichIsoform']) - swiss_model = swiss_model.astype('str') + swiss_model.at[ind, 'whichIsoform'] = 'nan' +# swiss_model.drop(['input'], axis=1, inplace=True) + swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL'] + print('Index File Processed...\n') + + + # Get relevant columns + swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']] + # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one. + swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False) + swiss_model.reset_index(inplace=True) + swiss_model.drop(['index'], axis=1, inplace=True) + + # Get protein IDs for which there exist models. + swiss_model_ids = set(swiss_model.UniProtKB_ac.to_list()) + to_swiss = to_swiss.astype(str) + no_swiss_models = pd.DataFrame() + for i in to_swiss.index: + if to_swiss.at[i, 'uniprotID'] not in swiss_model_ids: + k = pd.Series(to_swiss.iloc[i]) + no_swiss_models = no_swiss_models.append(k, ignore_index=True) + + no_swiss_models = no_swiss_models.astype(str) + if len(no_swiss_models) == 0: + no_swiss_models = pd.DataFrame(columns=to_swiss.columns) + else: + no_swiss_models = no_swiss_models[to_swiss.columns] + no_swiss_models.reset_index(inplace=True) + no_swiss_models.drop('index', axis=1, inplace=True) + + with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False) + with_swiss_models = with_swiss_models[to_swiss.columns] + + # Add model info. 
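The no_swiss_models loop above relies on DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0. A vectorized equivalent of the same split, assuming the to_swiss and swiss_model_ids names from this step:

    # Same split without per-row appends (DataFrame.append no longer exists in pandas 2.x).
    mask = to_swiss['uniprotID'].isin(swiss_model_ids)
    no_swiss_models = to_swiss[~mask].copy()
    with_swiss_models = to_swiss[mask].copy()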
+ + with_swiss_models = with_swiss_models.astype(str) + swiss_model = swiss_model.astype(str) + swiss_models_with_data = pd.merge(with_swiss_models, swiss_model, left_on=['uniprotID', 'whichIsoform'], + right_on=['UniProtKB_ac', 'whichIsoform'], + how='left') + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data = swiss_models_with_data.sort_values(by=['uniprotID', 'wt', 'mut', 'pos', 'qmean_norm'], + ascending=False) + swiss_models_with_data = swiss_models_with_data.drop_duplicates() + swiss_models_with_data = swiss_models_with_data.drop(['UniProtKB_ac', 'seqid'], axis=1) + swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') + swiss_models_with_data = swiss_models_with_data.astype(str) + + # Get the ones in the list but without model url and add to the list to go to modbase. + url_nan = swiss_models_with_data[swiss_models_with_data.url == 'nan'] + + # Add this nan's to no_model. These will be searched in MODBASE because here they dont have urls. + url_nan = url_nan.drop(['from', 'qmean_norm', 'template', 'to', 'url'], axis=1) + + no_swiss_models_2 = pd.concat([no_swiss_models, url_nan]) + swiss_models_with_data = swiss_models_with_data[swiss_models_with_data.url != 'nan'] + for i in swiss_models_with_data.index: try: - swiss_model.iso_id = swiss_model.iso_id.astype('str') + swiss_models_with_data.at[i, 'chain'] = swiss_models_with_data.at[i, 'template'].split('.')[2] + swiss_models_with_data.at[i, 'template'] = swiss_models_with_data.at[i, 'template'].split('.')[0] except: - AttributeError - swiss_model['iso_id'] = 'nan' - swiss_model = swiss_model[swiss_model.UniProtKB_ac != 'nan'] - for ind in swiss_model.index: - swiss_model.at[ind, 'UniProtKB_ac'] = swiss_model.at[ind, 'UniProtKB_ac'].split('-')[0] - if swiss_model.at[ind, 'iso_id'] != 'nan': - - swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1] - else: - swiss_model.at[ind, 'whichIsoform'] = 'nan' - # swiss_model.drop(['input'], axis=1, inplace=True) - swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL'] - print('Index File Processed...\n') - - # Get relevant columns - swiss_model = swiss_model[ - ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']] - # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one. - swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False) - swiss_model.reset_index(inplace=True) - swiss_model.drop(['index'], axis=1, inplace=True) - - # Get protein IDs for which there exist models. 
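The try/except in the block above pulls the PDB template id and chain out of the SWISS-MODEL 'template' field; judging by the split indices, those values look like '1xyz.1.A', though that exact shape is an assumption here. A small helper sketch:

    def split_template(template):
        """Split a SWISS-MODEL template string such as '1xyz.1.A' into (pdb_id, chain)."""
        parts = template.split('.')
        pdb_id = parts[0]
        chain = parts[2] if len(parts) > 2 else ''  # some entries carry no chain field
        return pdb_id, chain

    print(split_template('1xyz.1.A'))  # -> ('1xyz', 'A')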
- swiss_model_ids = set(swiss_model.UniProtKB_ac.to_list()) - to_swiss = to_swiss.astype(str) - no_swiss_models = pd.DataFrame() - for i in to_swiss.index: - if to_swiss.at[i, 'uniprotID'] not in swiss_model_ids: - k = pd.Series(to_swiss.iloc[i]) - no_swiss_models = no_swiss_models.append(k, ignore_index=True) - - no_swiss_models = no_swiss_models.astype(str) - if len(no_swiss_models) == 0: - no_swiss_models = pd.DataFrame(columns=to_swiss.columns) + IndexError + if len(swiss_models_with_data) == 0: + swiss_models_with_data['chain'] = '' + swiss_models_with_data['template'] = '' + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('str') + swiss_models_with_data.chain = swiss_models_with_data.chain.astype('str') + swiss_models_with_data['qmean_norm'] = swiss_models_with_data.qmean_norm.apply(lambda x: round(float(x), 2)) + swiss_models_with_data = swiss_models_with_data.astype(str) + + # swiss_models_with_data: These data points will be aligned with their corresponding model sequences. + # Add sequences + + no_swiss_models_2.reset_index(inplace=True) + no_swiss_models_2.drop('index', axis=1, inplace=True) + + swiss_models_with_data.reset_index(inplace=True) + swiss_models_with_data.drop('index', axis=1, inplace=True) + + swiss_model_ids = None + with_swiss_models = None + swiss_model = None + no_swiss_models = None + url_nan = None + + # At this point we have: + # pdb_aligned --- Align in the PDB phase + # not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. + # to_swiss (no_pdb + yes_pdb_no_match) --- to be searched in SwissModel database + # to_swiss (with_swiss_models & no_swiss_models) + # swiss_models_with_data --- We found swiss models for them. + # no_swiss_models_2 (no_swiss_models + url_nan)--- to be searched in modbase (the ones having swissmodels but not matching with the boundaries & broken_swiss will be added here) + + """ + STEP 9 + Associated model IDs are added. + Download model files. + """ + print('Beginning SwissModel files download...') + existing_swiss = list(Path(path_to_output_files / 'swissmodel_structures').glob("*")) + existing_swiss = [str(i) for i in existing_swiss] + existing_swiss = ['.'.join(i.split('/')[-1].split('.')[:-1]) for i in existing_swiss] + swissmodels_fasta = pd.DataFrame() + + for i in swiss_models_with_data.index: + protein = swiss_models_with_data.at[i, 'uniprotID'] + template = swiss_models_with_data.at[i, 'template'].split('.')[0] + qmean_norm = str(round(float(swiss_models_with_data.at[i, 'qmean_norm']), 2)) + if protein + '_' + template + '_' + qmean_norm not in existing_swiss: + url = swiss_models_with_data.at[i, 'url'].strip('\"').strip('}').replace('\\', '').strip('\"').replace( + 'https', + 'https:') + req = requests.get(url) + name = Path(path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') + print('Downloading for Protein:', protein + ' Model: ' + template) + with open(name, 'wb') as f: + f.write(req.content) else: - no_swiss_models = no_swiss_models[to_swiss.columns] - no_swiss_models.reset_index(inplace=True) - no_swiss_models.drop('index', axis=1, inplace=True) - - with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False) - with_swiss_models = with_swiss_models[to_swiss.columns] - - # Add model info. 
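The merge this comment introduces keys on both the protein and the isoform matched in STEP 4, then keeps the best-scoring model per datapoint. A distilled, self-contained sketch with toy frames (the accession and all values are dummies):

    import pandas as pd

    variants = pd.DataFrame({'datapoint': ['P12345_A10G'], 'uniprotID': ['P12345'],
                             'whichIsoform': ['nan']})
    models = pd.DataFrame({'UniProtKB_ac': ['P12345'], 'whichIsoform': ['nan'],
                           'template': ['1xyz.1.A'], 'qmean_norm': ['0.78']})
    merged = variants.merge(models, left_on=['uniprotID', 'whichIsoform'],
                            right_on=['UniProtKB_ac', 'whichIsoform'], how='left')
    merged['qmean_norm'] = merged['qmean_norm'].astype(float)
    best = (merged.sort_values('qmean_norm', ascending=False)
                  .drop_duplicates('datapoint', keep='first'))
    print(best[['datapoint', 'template', 'qmean_norm']])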
- - with_swiss_models = with_swiss_models.astype(str) - swiss_model = swiss_model.astype(str) - swiss_models_with_data = pd.merge(with_swiss_models, swiss_model, left_on=['uniprotID', 'whichIsoform'], - right_on=['UniProtKB_ac', 'whichIsoform'], - how='left') - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data = swiss_models_with_data.sort_values( - by=['uniprotID', 'wt', 'mut', 'pos', 'qmean_norm'], - ascending=False) - swiss_models_with_data = swiss_models_with_data.drop_duplicates() - swiss_models_with_data = swiss_models_with_data.drop(['UniProtKB_ac', 'seqid'], axis=1) - swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') - swiss_models_with_data = swiss_models_with_data.astype(str) - - # Get the ones in the list but without model url and add to the list to go to modbase. - url_nan = swiss_models_with_data[swiss_models_with_data.url == 'nan'] - - # Add this nan's to no_model. These will be searched in MODBASE because here they dont have urls. - url_nan = url_nan.drop(['from', 'qmean_norm', 'template', 'to', 'url'], axis=1) - - no_swiss_models_2 = pd.concat([no_swiss_models, url_nan]) - swiss_models_with_data = swiss_models_with_data[swiss_models_with_data.url != 'nan'] - for i in swiss_models_with_data.index: - try: - swiss_models_with_data.at[i, 'chain'] = swiss_models_with_data.at[i, 'template'].split('.')[2] - swiss_models_with_data.at[i, 'template'] = swiss_models_with_data.at[i, 'template'].split('.')[0] - except: - IndexError - if len(swiss_models_with_data) == 0: - swiss_models_with_data['chain'] = '' - swiss_models_with_data['template'] = '' - - swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('str') - swiss_models_with_data.chain = swiss_models_with_data.chain.astype('str') - swiss_models_with_data['qmean_norm'] = swiss_models_with_data.qmean_norm.apply(lambda x: round(float(x), 2)) - swiss_models_with_data = swiss_models_with_data.astype(str) - - # swiss_models_with_data: These data points will be aligned with their corresponding model sequences. - # Add sequences - - no_swiss_models_2.reset_index(inplace=True) - no_swiss_models_2.drop('index', axis=1, inplace=True) - - swiss_models_with_data.reset_index(inplace=True) - swiss_models_with_data.drop('index', axis=1, inplace=True) - - swiss_model_ids = None - with_swiss_models = None - swiss_model = None - no_swiss_models = None - url_nan = None - - # At this point we have: - # pdb_aligned --- Align in the PDB phase - # not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. - # to_swiss (no_pdb + yes_pdb_no_match) --- to be searched in SwissModel database - # to_swiss (with_swiss_models & no_swiss_models) - # swiss_models_with_data --- We found swiss models for them. - # no_swiss_models_2 (no_swiss_models + url_nan)--- to be searched in modbase (the ones having swissmodels but not matching with the boundaries & broken_swiss will be added here) - - """ - STEP 9 - Associated model IDs are added. - Download model files. 
- """ - print('Beginning SwissModel files download...') - existing_swiss = list(Path(path_to_output_files / 'swissmodel_structures').glob("*")) - existing_swiss = [str(i) for i in existing_swiss] - existing_swiss = ['.'.join(i.split('/')[-1].split('.')[:-1]) for i in existing_swiss] - swissmodels_fasta = pd.DataFrame() - - for i in swiss_models_with_data.index: - protein = swiss_models_with_data.at[i, 'uniprotID'] - template = swiss_models_with_data.at[i, 'template'].split('.')[0] - qmean_norm = str(round(float(swiss_models_with_data.at[i, 'qmean_norm']), 2)) - if protein + '_' + template + '_' + qmean_norm not in existing_swiss: - url = swiss_models_with_data.at[i, 'url'].strip('\"').strip('}').replace('\\', '').strip( - '\"').replace( - 'https', - 'https:') + print('Model exists.') + name = Path(path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') + with open(name, encoding="utf8") as f: + fasta = '' + lines = f.readlines() + chain = '' + for row in lines: + if row[0:4] == 'ATOM' and row[13:15] == 'CA': + chain = row[20:22].strip() + fasta += threeToOne(row[17:20]) + if row[0:3] == 'TER': + k = pd.Series([protein, template, qmean_norm, chain.upper(), fasta]) + swissmodels_fasta = swissmodels_fasta.append(k, ignore_index=True) + fasta = '' + + if len(swissmodels_fasta) == 0: + swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']) + else: + swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'] + + swissmodels_fasta = swissmodels_fasta.astype(str) + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float) + swissmodels_fasta.qmean_norm = swissmodels_fasta.qmean_norm.astype(float) + + swissmodels_fasta = swissmodels_fasta.sort_values(['uniprotID', 'template', 'qmean_norm', 'chain'], + axis=0) # example = 3gdh + swissmodels_fasta.reset_index(inplace=True) + swissmodels_fasta.drop(['index'], axis=1, inplace=True) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'qmean_norm', 'chain']) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'chain', 'fasta']) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'fasta']) + # Some files were broken, thus their PDBs couldnt be recorded. + swissmodels_fasta = swissmodels_fasta.drop_duplicates() + swissmodels_fasta = swissmodels_fasta.astype(str) + + swiss_models_with_data = swiss_models_with_data.astype(str) + swissmodels_fasta = swissmodels_fasta.astype(str) + swiss_models_with_data1 = swiss_models_with_data.merge(swissmodels_fasta, + on=['uniprotID', 'template', 'qmean_norm', 'chain']) + + swiss_models_with_data1 = swiss_models_with_data1.sort_values(['datapoint', 'fasta'], axis=0, + ascending=[True, False]) + swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template']) + + + swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list())) + swiss_models_with_data.reset_index(inplace=True) + swiss_models_with_data.drop(['index'], axis=1, inplace=True) + broken_swiss = pd.DataFrame() + c = 0 + for i in swiss_models_with_data.index: # en baştaki dfde var ama model gelende yok. 
+ if swiss_models_with_data.at[i, 'datapoint'] not in swiss_models_with_data1_dp: + k = pd.Series(swiss_models_with_data.iloc[i]) + broken_swiss = broken_swiss.append(k, ignore_index=True) + c += 1 + + if len(broken_swiss) == 0: + broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list()) + + swiss_models_with_data = swiss_models_with_data1.copy() + + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float') + swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'], + axis=0, ascending=[True, True, True, False]) + + # Delete the same model sequence with lower quality + swiss_models_with_data = swiss_models_with_data.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], + keep='first') + swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str') + swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') + len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(broken_swiss.drop_duplicates(['datapoint'])) + len( + no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint'])) + # This printed data here includes all possible models with different qualities, + # because we may get a hit in either of them. + swiss_models_with_data.rename({'fasta': 'pdbSequence'}, axis=1, inplace=True) # for convenience. + + # NOW DO ALIGNMENT HERE + + swiss_models_with_data = swiss_models_with_data.replace({'[\'?\']': 'nan'}) + swiss_models_with_data = swiss_models_with_data.replace({'[]': 'nan'}) + swiss_models_with_data.rename({'template': 'pdbID'}, axis=1, + inplace=True) # Only to be able use the alignment code above. + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data.pdbSequence = swiss_models_with_data.pdbSequence.astype('str') + swiss_models_with_data = add_annotations(swiss_models_with_data) + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data.replace({'NaN': 'nan'}, inplace=True) + swiss_models_with_data_copy = swiss_models_with_data.copy() + swiss_models_with_data1_dp = None + swiss_models_with_data1 = None + existing_swiss = None + swissmodels_fasta = None + + print('Aligning sequences...\n') + + swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C') + swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C') + swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, path_to_output_files / 'alignment_files') + swiss_models_with_data = None + + + if len(swiss_model_aligned) == 0: + swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns) + swiss_model_aligned['qmean_norm'] = 'nan' + else: + swiss_model_aligned = swiss_model_aligned.astype(str) + swiss_model_aligned.replace({'NaN': 'nan'}, inplace=True) + + # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. 
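The nan / not_nan handling just below leans on row order plus drop_duplicates: rows with a real alignment hit are concatenated first, so keep='first' prefers them whenever a datapoint appears in both groups. A toy demonstration of that trick:

    import pandas as pd

    df = pd.DataFrame({'datapoint': ['A', 'A', 'B'],
                       'mutationPositionOnPDB': ['17', 'nan', 'nan']})
    not_nan = df[df.mutationPositionOnPDB != 'nan']
    nan = df[df.mutationPositionOnPDB == 'nan']
    picked = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
    print(picked)  # A keeps position 17; B stays 'nan'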
+ nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB == 'nan'] + not_nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB != 'nan'] + not_nan.qmean_norm = not_nan.qmean_norm.astype('float') + not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'qmean_norm'], ascending=[True, True, False], inplace=True) + + which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') + swiss_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + swiss_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] + + swiss_match.qmean_norm = swiss_match.qmean_norm.astype('float') + swiss_match.sort_values(['uniprotID', 'wt', 'pos', 'mut', 'pdb_alignStatus', 'qmean_norm'], + ascending=[True, True, True, True, True, False], inplace=True) + swiss_match.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut'], keep='first', inplace=True) + swiss_not_match = swiss_not_match[no_swiss_models_2.columns] + broken_swiss = broken_swiss[no_swiss_models_2.columns] + swiss_not_match = swiss_not_match.drop_duplicates(['datapoint']) + broken_swiss = broken_swiss.drop_duplicates(['datapoint']) + + to_modbase = pd.concat([no_swiss_models_2, broken_swiss]).drop_duplicates() + to_modbase = pd.concat([to_modbase, swiss_not_match]).drop_duplicates() + to_modbase = to_modbase.astype(str) + to_swiss_columns = to_swiss.columns + to_swiss_size = len(to_swiss.drop_duplicates(['datapoint'])) + to_swiss = None + + # CONTROL + + """ + # This should be the whole data. + len(swiss_match.drop_duplicates(['datapoint'])) + len(aligned.drop_duplicates(['datapoint'])) + len(to_modbase.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) ,len(data) + len(aligned.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) +len(to_swiss.drop_duplicates(['datapoint']))== len(data) + """ + print('SwissModel matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' % ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d successfully aligned with SwissModels structures.' % ( + len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) + print('--%d will be searched in ModBase database.\n' % len(to_modbase.drop_duplicates(['datapoint']))) + + print('Proceeding to ModBase search...') + print('------------------------------------\n') + no_swiss_models_2 = None + broken_swiss = None + swiss_model_aligned = None + nan = None + not_nan = None + which_ones_are_match = None + swiss_not_match = None + + # STEP : GO TO MODBASE + # Should not include anything related to prev models. 
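# --- editor's note ----------------------------------------------------------
# The ModBase branch below downloads one concatenated response per UniProt ID
# from salilab.org and splits it into individual model files by parsing the
# <pdbfile> elements with BeautifulSoup. A condensed sketch of that retrieval
# step; the slicing of pdb.contents mirrors the patch and is assumed, not
# verified against the current ModBase response format:
import requests
from bs4 import BeautifulSoup

def fetch_modbase_models(uniprot_id):
    url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + uniprot_id
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    models = {}
    for pdb in soup.findAll('pdbfile'):
        model_id = str(pdb.contents[1])[10:-11]            # strip the surrounding tags
        models[model_id] = str(pdb.contents[3])[10:-11].strip()
    return models
# --- end editor's note --------------------------------------------------------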
+ if len(to_modbase) != 0: + to_modbase = to_modbase.astype(str) + + # GET MODBASE MODELS + + # Get IDs from data to retrieve only their models from MODBASE + to_modbase.reset_index(inplace=True) + to_modbase.drop(['index'], axis=1, inplace=True) + + existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*")) + existing_modbase_models = [str(i) for i in existing_modbase_models] + existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models] + + existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*")) + existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind] + existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind] + + modbase_reduced = pd.DataFrame() + modbase_fasta = pd.DataFrame() + + print('Retrieving ModBase models...\n') + # Get model files associated with each UniProtID + for protein in list(set(to_modbase.uniprotID.to_list())): + if protein not in existing_modbase_models: + print('Downloading Modbase models for ', protein) + url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein + print(url) req = requests.get(url) - name = Path( - path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') - print('Downloading for Protein:', protein + ' Model: ' + template) + name = path_to_output_files / 'modbase_structures' / f'{protein}.txt' with open(name, 'wb') as f: f.write(req.content) else: - print('Model exists.') - name = Path( - path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') + print('Model exists for', protein) + name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt') with open(name, encoding="utf8") as f: - fasta = '' - lines = f.readlines() - chain = '' - for row in lines: - if row[0:4] == 'ATOM' and row[13:15] == 'CA': - chain = row[20:22].strip() - fasta += threeToOne(row[17:20]) - if row[0:3] == 'TER': - k = pd.Series([protein, template, qmean_norm, chain.upper(), fasta]) - swissmodels_fasta = swissmodels_fasta.append(k, ignore_index=True) + a = open(name, 'r').read() + soup = BeautifulSoup(a, 'lxml') + for pdb in soup.findAll('pdbfile'): + model_id = str(pdb.contents[1])[10:-11] + if model_id not in existing_modbase_models_ind: + with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w', + encoding="utf8") as individual: + individual.write(str('UniProt ID: ' + protein)) + individual.write('\n') + individual.write(str(pdb.contents[3])[10:-11].strip()) + with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', + encoding="utf8") as f: fasta = '' + chain = '' + template_chain = '' + score = -999 + for ind_line in f.readlines(): + if ind_line[0:10] == 'UniProt ID': + uniprot_id = ind_line.split(':')[1].strip() + if ind_line[0:23] == 'REMARK 220 TARGET BEGIN': + target_begin = ind_line[40:43].strip() + if ind_line[0:21] == 'REMARK 220 TARGET END': + target_end = ind_line[40:43].strip() + if ind_line[0:25] == 'REMARK 220 TEMPLATE BEGIN': + pdb_begin = ind_line[40:43].strip() + if ind_line[0:23] == 'REMARK 220 TEMPLATE END': + pdb_end = ind_line[40:43].strip() + if ind_line[0:23] == 'REMARK 220 TEMPLATE PDB': + pdb_code = ind_line[40:43].strip() + if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': + pdb_chain = ind_line[40:43].strip() + if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': + quality_score = ind_line[40:].strip() + if 
ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
+                                        model_id = ind_line[40:].strip()
+                                    if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN':
+                                        template_chain = ind_line[40:42].strip()
+                                    if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA':
+                                        fasta += threeToOne(ind_line[17:20])
+                                    if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score':
+                                        try:
+                                            score = float(ind_line[40:].strip())
+                                        except ValueError:
+                                            score = -999
+                                    if ind_line[0:3] == 'TER' or ind_line[0:3] == 'END':
+                                        k = pd.Series([uniprot_id, model_id, str(score), template_chain, fasta])
+                                        modbase_fasta = modbase_fasta.append(k, ignore_index=True)
+                                        fasta = ''
+                                try:
+                                    k = pd.Series(
+                                        [uniprot_id, target_begin, target_end, pdb_code, pdb_chain, pdb_begin, pdb_end,
+                                         quality_score,
+                                         model_id])
+                                    modbase_reduced = modbase_reduced.append(k, ignore_index=True)
+                                except NameError:
+                                    print("This file doesn't have a Quality Score. Replacing with -999:", model_id)
+                                    quality_score = -999

-            if len(swissmodels_fasta) == 0:
-                swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
+            print()
+            if len(modbase_fasta) != 0:
+                modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
             else:
-                swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
-
-            swissmodels_fasta = swissmodels_fasta.astype(str)
-
-            swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
-            swissmodels_fasta.qmean_norm = swissmodels_fasta.qmean_norm.astype(float)
-
-            swissmodels_fasta = swissmodels_fasta.sort_values(['uniprotID', 'template', 'qmean_norm', 'chain'],
-                                                              axis=0)  # example = 3gdh
-            swissmodels_fasta.reset_index(inplace=True)
-            swissmodels_fasta.drop(['index'], axis=1, inplace=True)
-            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'qmean_norm', 'chain'])
-            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'chain', 'fasta'])
-            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'fasta'])
-            # Some files were broken, so their PDB sequences couldn't be recorded.
-            swissmodels_fasta = swissmodels_fasta.drop_duplicates()
-            swissmodels_fasta = swissmodels_fasta.astype(str)
-
-            swiss_models_with_data = swiss_models_with_data.astype(str)
-            swissmodels_fasta = swissmodels_fasta.astype(str)
-            swiss_models_with_data1 = swiss_models_with_data.merge(swissmodels_fasta,
-                                                                   on=['uniprotID', 'template', 'qmean_norm', 'chain'])
-
-            swiss_models_with_data1 = swiss_models_with_data1.sort_values(['datapoint', 'fasta'], axis=0,
-                                                                          ascending=[True, False])
-            swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
-
-            swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
-            swiss_models_with_data.reset_index(inplace=True)
-            swiss_models_with_data.drop(['index'], axis=1, inplace=True)
-            broken_swiss = pd.DataFrame()
-            c = 0
-            for i in swiss_models_with_data.index:  # present in the initial dataframe but missing from the model results.
- if swiss_models_with_data.at[i, 'datapoint'] not in swiss_models_with_data1_dp: - k = pd.Series(swiss_models_with_data.iloc[i]) - broken_swiss = broken_swiss.append(k, ignore_index=True) - c += 1 - - if len(broken_swiss) == 0: - broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list()) - - swiss_models_with_data = swiss_models_with_data1.copy() - - swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float') - swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'], - axis=0, ascending=[True, True, True, False]) - - # Delete the same model sequence with lower quality - swiss_models_with_data = swiss_models_with_data.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], - keep='first') - swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str') - swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') - len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len( - broken_swiss.drop_duplicates(['datapoint'])) + len( - no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint'])) - # This printed data here includes all possible models with different qualities, - # because we may get a hit in either of them. - swiss_models_with_data.rename({'fasta': 'pdbSequence'}, axis=1, inplace=True) # for convenience. - - # NOW DO ALIGNMENT HERE - - swiss_models_with_data = swiss_models_with_data.replace({'[\'?\']': 'nan'}) - swiss_models_with_data = swiss_models_with_data.replace({'[]': 'nan'}) - swiss_models_with_data.rename({'template': 'pdbID'}, axis=1, - inplace=True) # Only to be able use the alignment code above. - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data.pdbSequence = swiss_models_with_data.pdbSequence.astype('str') - swiss_models_with_data = add_annotations(swiss_models_with_data) - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data.replace({'NaN': 'nan'}, inplace=True) - swiss_models_with_data_copy = swiss_models_with_data.copy() - swiss_models_with_data1_dp = None - swiss_models_with_data1 = None - existing_swiss = None - swissmodels_fasta = None - - print('Aligning sequences...\n') - - swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C') - swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C') - swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, - path_to_output_files / 'alignment_files') - swiss_models_with_data = None - - if len(swiss_model_aligned) == 0: - swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns) - swiss_model_aligned['qmean_norm'] = 'nan' + modbase_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'score', 'chain', 'fasta']) + modbase_fasta = modbase_fasta.astype(str) + modbase_fasta = modbase_fasta.replace({'': 'nan'}) + modbase_fasta = modbase_fasta.replace({'NaN': 'nan'}) + modbase_fasta = modbase_fasta[modbase_fasta.fasta != 'nan'] + + print('Modbase model frame constructed.\n') + if len(modbase_reduced) != 0: + modbase_reduced.columns = ['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', + 'PDBEnd', + 'ModPipeQualityScore', 'ModelID'] else: - swiss_model_aligned = swiss_model_aligned.astype(str) - swiss_model_aligned.replace({'NaN': 'nan'}, inplace=True) + modbase_reduced = pd.DataFrame( + columns=['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 
'PDBChain', 'PDBBegin', 'PDBEnd', + 'ModPipeQualityScore', 'ModelID']) - # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. - nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB == 'nan'] - not_nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB != 'nan'] - not_nan.qmean_norm = not_nan.qmean_norm.astype('float') - not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'qmean_norm'], ascending=[True, True, False], - inplace=True) + to_modbase = add_annotations(to_modbase) - which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') - swiss_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - swiss_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - - swiss_match.qmean_norm = swiss_match.qmean_norm.astype('float') - swiss_match.sort_values(['uniprotID', 'wt', 'pos', 'mut', 'pdb_alignStatus', 'qmean_norm'], - ascending=[True, True, True, True, True, False], inplace=True) - swiss_match.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut'], keep='first', inplace=True) - swiss_not_match = swiss_not_match[no_swiss_models_2.columns] - broken_swiss = broken_swiss[no_swiss_models_2.columns] - swiss_not_match = swiss_not_match.drop_duplicates(['datapoint']) - broken_swiss = broken_swiss.drop_duplicates(['datapoint']) - - to_modbase = pd.concat([no_swiss_models_2, broken_swiss]).drop_duplicates() - to_modbase = pd.concat([to_modbase, swiss_not_match]).drop_duplicates() to_modbase = to_modbase.astype(str) - to_swiss_columns = to_swiss.columns - to_swiss_size = len(to_swiss.drop_duplicates(['datapoint'])) - to_swiss = None - - # CONTROL - - """ - # This should be the whole data. - len(swiss_match.drop_duplicates(['datapoint'])) + len(aligned.drop_duplicates(['datapoint'])) + len(to_modbase.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) ,len(data) - len(aligned.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) +len(to_swiss.drop_duplicates(['datapoint']))== len(data) - """ - print('SwissModel matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' % len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d successfully aligned with SwissModels structures.' % ( - len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) - print('--%d will be searched in ModBase database.\n' % len(to_modbase.drop_duplicates(['datapoint']))) - - print('Proceeding to ModBase search...') - print('------------------------------------\n') - no_swiss_models_2 = None - broken_swiss = None - swiss_model_aligned = None - nan = None - not_nan = None - which_ones_are_match = None - swiss_not_match = None - - # STEP : GO TO MODBASE - # Should not include anything related to prev models. 
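# --- editor's note ----------------------------------------------------------
# The per-model metadata gathered in the new parser above comes from ModBase's
# PDB-style REMARK 220 records, read at fixed column offsets. A standalone
# sketch of the same idea, with the numeric parse done where ValueError can
# actually occur (the patch's original try/except wrapped a plain .strip(),
# which never raises it):
def parse_remark_220(lines):
    fields = {'score': -999.0}                  # sentinel used throughout the patch
    for line in lines:
        if line.startswith('REMARK 220 TEMPLATE PDB'):
            fields['template_pdb'] = line[40:43].strip()
        elif line.startswith('REMARK 220 TEMPLATE CHAIN'):
            fields['template_chain'] = line[40:42].strip()
        elif line.startswith('REMARK 220 ModPipe Quality Score'):
            try:
                fields['score'] = float(line[40:].strip())
            except ValueError:
                pass                            # keep the -999 sentinel
    return fields
# --- end editor's note --------------------------------------------------------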
- if len(to_modbase) != 0: - to_modbase = to_modbase.astype(str) - - # GET MODBASE MODELS - - # Get IDs from data to retrieve only their models from MODBASE - to_modbase.reset_index(inplace=True) - to_modbase.drop(['index'], axis=1, inplace=True) - - existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*")) - existing_modbase_models = [str(i) for i in existing_modbase_models] - existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models] - - existing_modbase_models_ind = list( - Path(path_to_output_files / 'modbase_structures_individual').glob("*")) - existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind] - existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind] - - modbase_reduced = pd.DataFrame() - modbase_fasta = pd.DataFrame() - - print('Retrieving ModBase models...\n') - # Get model files associated with each UniProtID - for protein in list(set(to_modbase.uniprotID.to_list())): - if protein not in existing_modbase_models: - print('Downloading Modbase models for ', protein) - url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein - print(url) - req = requests.get(url) - name = path_to_output_files / 'modbase_structures' / f'{protein}.txt' - with open(name, 'wb') as f: - f.write(req.content) - else: - print('Model exists for', protein) - name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt') - with open(name, encoding="utf8") as f: - a = open(name, 'r').read() - soup = BeautifulSoup(a, 'lxml') - for pdb in soup.findAll('pdbfile'): - model_id = str(pdb.contents[1])[10:-11] - if model_id not in existing_modbase_models_ind: - with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', - 'w', - encoding="utf8") as individual: - individual.write(str('UniProt ID: ' + protein)) - individual.write('\n') - individual.write(str(pdb.contents[3])[10:-11].strip()) - with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', - encoding="utf8") as f: - fasta = '' - chain = '' - template_chain = '' - score = -999 - for ind_line in f.readlines(): - if ind_line[0:10] == 'UniProt ID': - uniprot_id = ind_line.split(':')[1].strip() - if ind_line[0:23] == 'REMARK 220 TARGET BEGIN': - target_begin = ind_line[40:43].strip() - if ind_line[0:21] == 'REMARK 220 TARGET END': - target_end = ind_line[40:43].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE BEGIN': - pdb_begin = ind_line[40:43].strip() - if ind_line[0:23] == 'REMARK 220 TEMPLATE END': - pdb_end = ind_line[40:43].strip() - if ind_line[0:23] == 'REMARK 220 TEMPLATE PDB': - pdb_code = ind_line[40:43].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': - pdb_chain = ind_line[40:43].strip() - if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': - quality_score = ind_line[40:].strip() - if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID': - model_id = ind_line[40:].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': - template_chain = ind_line[40:42].strip() - if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA': - fasta += threeToOne(ind_line[17:20]) - if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': - try: - score = ind_line[40:].strip() - except (ValueError): - score = -999 - if ind_line[0:3] == 'TER' or ind_line[0:3] == 'END': - k = pd.Series([uniprot_id, model_id, str(score), template_chain, fasta]) - modbase_fasta = modbase_fasta.append(k, ignore_index=True) - fasta = '' - try: - k = pd.Series( - 
[uniprot_id, target_begin, target_end, pdb_code, pdb_chain, pdb_begin, pdb_end, - quality_score, - model_id]) - modbase_reduced = modbase_reduced.append(k, ignore_index=True) - except: - NameError - print('This file doesnt have Quality Score. Replacer: -999', model_id) - quality_score = -999 - - print() - if len(modbase_fasta) != 0: - modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta'] - else: - modbase_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'score', 'chain', 'fasta']) - modbase_fasta = modbase_fasta.astype(str) - modbase_fasta = modbase_fasta.replace({'': 'nan'}) - modbase_fasta = modbase_fasta.replace({'NaN': 'nan'}) - modbase_fasta = modbase_fasta[modbase_fasta.fasta != 'nan'] - - print('Modbase model frame constructed.\n') - if len(modbase_reduced) != 0: - modbase_reduced.columns = ['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', - 'PDBEnd', - 'ModPipeQualityScore', 'ModelID'] - else: - modbase_reduced = pd.DataFrame( - columns=['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', 'PDBEnd', - 'ModPipeQualityScore', 'ModelID']) - - to_modbase = add_annotations(to_modbase) - - to_modbase = to_modbase.astype(str) - to_modbase.fillna('nan', inplace=True) - to_modbase = to_modbase.replace({'NaN': 'nan'}) - to_modbase.replace({'[]': 'nan'}, inplace=True) - to_modbase.replace({'nan-nan': 'nan'}, inplace=True) - to_modbase.replace({'': 'nan'}, inplace=True) - model_info_added = to_modbase.merge(modbase_reduced, right_on='UniprotID', left_on='uniprotID', - how='left') - modbase_reduced = None - existing_modbase_models = None - existing_modbase_models_ind = None - - model_info_added = model_info_added.drop(['UniprotID'], axis=1) - model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to', - 'PDBCode': 'template', 'PDBChain': 'chain', - 'ModPipeQualityScore': 'score', - 'ModelID': 'pdbID'}) - model_info_added.drop(['PDBEnd', 'PDBBegin'], axis=1, inplace=True) - model_info_added.score = model_info_added.score.astype(float) - model_info_added = model_info_added.sort_values(by=['datapoint', 'score'], - ascending=False) - model_info_added.reset_index(inplace=True) - model_info_added.drop(['index'], axis=1, inplace=True) - model_info_added = model_info_added.drop_duplicates() - - model_info_added = model_info_added.astype(str) - model_info_added = model_info_added.replace({'NaN': 'nan'}) - no_info = model_info_added[model_info_added.pdbID == 'nan'] - with_modbase_info = model_info_added[model_info_added.pdbID != 'nan'] - model_info_added = None - - len(no_info.drop_duplicates(['datapoint'])), len(with_modbase_info.drop_duplicates(['datapoint'])) - len(no_info.drop_duplicates(['datapoint'])) + len( - with_modbase_info.drop_duplicates(['datapoint'])) == len( - to_modbase.drop_duplicates(['datapoint'])) - - # Add no_info to the rest down below! - no_info = no_info[to_swiss_columns] - - with_modbase_info.score = with_modbase_info.score.astype(float) - modbase_fasta.score = modbase_fasta.score.astype(float) - - modbase_fasta = modbase_fasta.sort_values(['uniprotID', 'score', 'template', 'chain'], - ascending=[True, False, True, True], axis=0) # example = 3gdh - - # I added this newly downloaded ones to the main model file. 
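# --- editor's note ----------------------------------------------------------
# Both the removed and the added code grow modbase_fasta / modbase_reduced row
# by row with DataFrame.append, which was deprecated in pandas 1.4 and removed
# in 2.0 (each call also copies the whole frame). Collecting rows in a list
# and concatenating once is the supported, linear-time pattern:
import pandas as pd

rows = []
# inside the parsing loop, instead of modbase_fasta.append(...):
#     rows.append({'uniprotID': uniprot_id, 'template': model_id,
#                  'score': score, 'chain': template_chain, 'fasta': fasta})
modbase_fasta = pd.DataFrame(rows, columns=['uniprotID', 'template', 'score', 'chain', 'fasta'])
# --- end editor's note --------------------------------------------------------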
- - modbase_fasta = modbase_fasta.rename(columns={'template': 'pdbID'}) - with_modbase_info.pos = with_modbase_info.pos.astype('int') - with_modbase_info.score = with_modbase_info.score.astype(float) - with_modbase_info.score = with_modbase_info.score.apply(lambda x: round(x, 2)) - modbase_fasta.score = modbase_fasta.score.astype(float) - modbase_fasta.score = modbase_fasta.score.apply(lambda x: round(x, 2)) - - with_modbase_info = with_modbase_info.merge(modbase_fasta, on='pdbID', how='left') - - with_modbase_info.drop(['score_y'], axis=1, inplace=True) - with_modbase_info.rename(columns={'score_x': 'score'}, inplace=True) - with_modbase_info.drop(['uniprotID_y', 'chain_y'], axis=1, inplace=True) - with_modbase_info.rename(columns={'uniprotID_x': 'uniprotID', 'chain_x': 'chain'}, inplace=True) - - with_modbase_info.score = with_modbase_info.score.astype('float') - with_modbase_info = with_modbase_info.sort_values( - ['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'], - axis=0, - ascending=[True, True, True, True, False, True, False]) - with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], - keep='first') - - with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'}) - with_modbase_info = with_modbase_info.replace({'[]': 'nan'}) - with_modbase_info = with_modbase_info.replace({'\'?\', ': ''}) - with_modbase_info = with_modbase_info.replace({', \'?\'': ''}) - with_modbase_info = with_modbase_info.replace({'(': ''}) - with_modbase_info = with_modbase_info.replace( - {')': ''}) - with_modbase_info = with_modbase_info.astype(str) - with_modbase_info.fasta = with_modbase_info.fasta.astype('str') - with_modbase_info.reset_index(inplace=True) - with_modbase_info.drop('index', axis=1, inplace=True) - - align = with_modbase_info[ - with_modbase_info.fasta != 'nan'] - yes_pdb_no_match = with_modbase_info[ - with_modbase_info.fasta == 'nan'] - yes_pdb_no_match = yes_pdb_no_match[~yes_pdb_no_match.datapoint.isin(align.datapoint.to_list())] - - align.rename(columns={'fasta': 'pdbSequence'}, inplace=True) - align['uniprotSequence'] = align['uniprotSequence'].str.replace('U', 'C') - align['pdbSequence'] = align['pdbSequence'].str.replace('U', 'C') - - to_modbase_size = len(to_modbase.drop_duplicates(['datapoint'])) - modbase_fasta = None - to_modbase = None - print('Aligning sequences...\n') - modbase_aligned = alignment(align, annotation_list, path_to_output_files / 'alignment_files') - modbase_aligned = modbase_aligned.astype(str) - modbase_aligned = modbase_aligned.replace({'NaN': 'nan'}) - - # Get the ones whose models couldn't be found. Add to no_modbase (yani hiçbir şey de eşleşmemiş artık.) 
- if len(with_modbase_info) != 0: - not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']), - with_modbase_info.drop_duplicates(['datapoint'])]).drop_duplicates( - ['datapoint'], - keep=False) - else: - not_in_aligned = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', - 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfide', - 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', - 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', - 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) - with_modbase_info = None - if len(not_in_aligned) != 0: - not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), - not_in_aligned.drop_duplicates(['datapoint'])]).drop_duplicates( - ['datapoint'], - keep='first') - # Retain the best model among the aligned ones. - else: - not_models = pd.DataFrame(columns=not_in_aligned.columns) - - yes_pdb_no_match = None - # # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. 
- modbase_aligned = modbase_aligned.astype(str) - if len(modbase_aligned) != 0: - nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan'] - not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan'] - not_nan.score = not_nan.score.astype(float) - not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], - inplace=True) - - not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], - ascending=[True, True, False]) - not_nan = not_nan.drop_duplicates(['datapoint'], keep='first') - else: - nan = pd.DataFrame(columns=modbase_aligned.columns) - not_nan = pd.DataFrame(columns=modbase_aligned.columns) - modbase_aligned = None - which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') - if len(which_ones_are_match) == 0: - which_ones_are_match = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', - 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', - 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) - modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - - else: - modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - - which_ones_are_match = None - modbase_match.score = modbase_match.score.astype('float') - modbase_match = modbase_match.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], - ascending=[True, True, False]) - modbase_match.drop_duplicates(['datapoint'], keep='first', inplace=True) - not_nan = None - nan = None - - # merge not_in_align and modbase_not_match as they were both excluded from modbase match. 
- - # No model - no_info = no_info[to_swiss_columns] - no_info = no_info.drop_duplicates() - - # Model present, no sequence - not_models = not_models[to_swiss_columns] - not_models = not_models.drop_duplicates() - - # Modbase model and sequence present, no match in PDB - modbase_not_match = modbase_not_match[to_swiss_columns] - modbase_not_match = modbase_not_match.drop_duplicates() - if len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) != 0: - rest = pd.concat([not_in_aligned, modbase_not_match, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) == 0: - rest = pd.concat([not_in_aligned, modbase_not_match]) - elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) != 0: - rest = pd.concat([modbase_not_match, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) != 0: - rest = pd.concat([not_in_aligned, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) == 0: - rest = not_in_aligned - elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) == 0: - rest = modbase_not_match - elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0: - rest = no_info - else: - rest = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - - rest = rest[to_swiss_columns] - rest = rest.drop_duplicates() - - rest.reset_index(inplace=True) - rest.drop(['index'], axis=1, inplace=True) - rest = rest.astype('str') + to_modbase.fillna('nan', inplace=True) + to_modbase = to_modbase.replace({'NaN': 'nan'}) + to_modbase.replace({'[]': 'nan'}, inplace=True) + to_modbase.replace({'nan-nan': 'nan'}, inplace=True) + to_modbase.replace({'': 'nan'}, inplace=True) + model_info_added = to_modbase.merge(modbase_reduced, right_on='UniprotID', left_on='uniprotID', + how='left') + modbase_reduced = None + existing_modbase_models = None + existing_modbase_models_ind = None + + + model_info_added = model_info_added.drop(['UniprotID'], axis=1) + model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to', + 'PDBCode': 'template', 'PDBChain': 'chain', + 'ModPipeQualityScore': 'score', + 'ModelID': 'pdbID'}) + model_info_added.drop(['PDBEnd', 'PDBBegin'], axis=1, inplace=True) + model_info_added.score = model_info_added.score.astype(float) + model_info_added = model_info_added.sort_values(by=['datapoint', 'score'], + ascending=False) + model_info_added.reset_index(inplace=True) + model_info_added.drop(['index'], axis=1, inplace=True) + model_info_added = model_info_added.drop_duplicates() + + model_info_added = model_info_added.astype(str) + model_info_added = model_info_added.replace({'NaN': 'nan'}) + no_info = model_info_added[model_info_added.pdbID == 'nan'] + with_modbase_info = model_info_added[model_info_added.pdbID != 'nan'] + model_info_added = None + + len(no_info.drop_duplicates(['datapoint'])), len(with_modbase_info.drop_duplicates(['datapoint'])) + len(no_info.drop_duplicates(['datapoint'])) + len(with_modbase_info.drop_duplicates(['datapoint'])) == len( + to_modbase.drop_duplicates(['datapoint'])) + + # Add no_info to the rest down below! 
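# --- editor's note ----------------------------------------------------------
# The two bare len(...) expressions a few lines up compute a consistency check
# and then discard the result. If the invariant is worth computing, an assert
# makes it enforceable (same variable names as the patch):
n_no_info = len(no_info.drop_duplicates(['datapoint']))
n_with_info = len(with_modbase_info.drop_duplicates(['datapoint']))
assert n_no_info + n_with_info == len(to_modbase.drop_duplicates(['datapoint'])), \
    'every to_modbase datapoint must land in exactly one of no_info / with_modbase_info'
# --- end editor's note --------------------------------------------------------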
+            no_info = no_info[to_swiss_columns]
+
+            with_modbase_info.score = with_modbase_info.score.astype(float)
+            modbase_fasta.score = modbase_fasta.score.astype(float)
+
+            modbase_fasta = modbase_fasta.sort_values(['uniprotID', 'score', 'template', 'chain'],
+                                                      ascending=[True, False, True, True], axis=0)  # example = 3gdh
+
+            # The newly downloaded models are added to the main model file.
+
+            modbase_fasta = modbase_fasta.rename(columns={'template': 'pdbID'})
+            with_modbase_info.pos = with_modbase_info.pos.astype('int')
+            with_modbase_info.score = with_modbase_info.score.astype(float)
+            with_modbase_info.score = with_modbase_info.score.apply(lambda x: round(x, 2))
+            modbase_fasta.score = modbase_fasta.score.astype(float)
+            modbase_fasta.score = modbase_fasta.score.apply(lambda x: round(x, 2))
+
+            with_modbase_info = with_modbase_info.merge(modbase_fasta, on='pdbID', how='left')
+
+            with_modbase_info.drop(['score_y'], axis=1, inplace=True)
+            with_modbase_info.rename(columns={'score_x': 'score'}, inplace=True)
+            with_modbase_info.drop(['uniprotID_y', 'chain_y'], axis=1, inplace=True)
+            with_modbase_info.rename(columns={'uniprotID_x': 'uniprotID', 'chain_x': 'chain'}, inplace=True)
+
+            with_modbase_info.score = with_modbase_info.score.astype('float')
+            with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
+                                                              axis=0,
+                                                              ascending=[True, True, True, True, False, True, False])
+            with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], keep='first')
+
+            with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
+            with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
+            with_modbase_info = with_modbase_info.replace({'\'?\', ': ''})
+            with_modbase_info = with_modbase_info.replace({', \'?\'': ''})
+            with_modbase_info = with_modbase_info.replace({'(': ''})
+            with_modbase_info = with_modbase_info.replace({')': ''})
+            with_modbase_info = with_modbase_info.astype(str)
+            with_modbase_info.fasta = with_modbase_info.fasta.astype('str')
+            with_modbase_info.reset_index(inplace=True)
+            with_modbase_info.drop('index', axis=1, inplace=True)
+
+
+            align = with_modbase_info[with_modbase_info.fasta != 'nan']
+            yes_pdb_no_match = with_modbase_info[with_modbase_info.fasta == 'nan']
+            yes_pdb_no_match = yes_pdb_no_match[~yes_pdb_no_match.datapoint.isin(align.datapoint.to_list())]
+
+            align.rename(columns={'fasta': 'pdbSequence'}, inplace=True)
+            align['uniprotSequence'] = align['uniprotSequence'].str.replace('U', 'C')
+            align['pdbSequence'] = align['pdbSequence'].str.replace('U', 'C')
+
+            to_modbase_size = len(to_modbase.drop_duplicates(['datapoint']))
+            modbase_fasta = None
+            to_modbase = None
+            print('Aligning sequences...\n')
+            modbase_aligned = alignment(align, annotation_list, path_to_output_files / 'alignment_files')
+            modbase_aligned = modbase_aligned.astype(str)
+            modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
+            # Get the ones whose models couldn't be found and add them to no_modbase (i.e., datapoints that matched nothing at all).
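# --- editor's note ----------------------------------------------------------
# The not_in_aligned computation below is a set-difference idiom: after each
# frame is deduplicated by datapoint, concat + drop_duplicates(keep=False)
# keeps only the keys that occur in exactly one of the two frames. Since every
# aligned datapoint also exists in with_modbase_info, the survivors are the
# datapoints that never made it through alignment. A toy illustration:
import pandas as pd

aligned = pd.DataFrame({'datapoint': ['A', 'B']})
candidates = pd.DataFrame({'datapoint': ['A', 'B', 'C']})
missing = pd.concat([aligned, candidates]).drop_duplicates(['datapoint'], keep=False)
# -> only 'C', the candidate with no aligned counterpart, remains
# --- end editor's note --------------------------------------------------------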
+ if len(with_modbase_info) != 0: + not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']), + with_modbase_info.drop_duplicates(['datapoint'])]).drop_duplicates( + ['datapoint'], + keep=False) else: - - modbase_match = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', + 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfide', + 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', + 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', + 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) + with_modbase_info = None + if len(not_in_aligned) != 0: + not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), + not_in_aligned.drop_duplicates(['datapoint'])]).drop_duplicates(['datapoint'], + keep='first') + # Retain the best model among the aligned ones. + else: + not_models = pd.DataFrame(columns=not_in_aligned.columns) + + yes_pdb_no_match = None + # # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. 
+ modbase_aligned = modbase_aligned.astype(str) + if len(modbase_aligned) != 0: + nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan'] + not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan'] + not_nan.score = not_nan.score.astype(float) + not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], inplace=True) + + not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], + ascending=[True, True, False]) + not_nan = not_nan.drop_duplicates(['datapoint'], keep='first') + else: + nan = pd.DataFrame(columns=modbase_aligned.columns) + not_nan = pd.DataFrame(columns=modbase_aligned.columns) + modbase_aligned = None + which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') + if len(which_ones_are_match) == 0: + which_ones_are_match = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', @@ -1240,483 +1118,547 @@ def pdb(input_set, mode, impute): 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) - not_in_aligned = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide', - 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', - 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) - no_info = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - rest = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - - rest = rest[to_swiss_columns] - rest = rest.drop_duplicates() - - rest.reset_index(inplace=True) - rest.drop(['index'], axis=1, inplace=True) - rest = rest.astype('str') - to_modbase_size = 0 - - print('Modbase matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' 
% len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d successfully aligned with SwissModels structures.' % ( - len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) - print('--%d of %d successfully aligned with Modbase structures.\n' % ( - len(modbase_match.drop_duplicates(['datapoint'])), to_modbase_size)) - print('--Remaining %d not found to match any models.' % len(rest.drop_duplicates(['datapoint']))) - print('--A total of %d datapoints will not be evaluated.\n' % ( - len(rest.drop_duplicates(['datapoint'])) + len( - not_match_in_uniprot.drop_duplicates(['datapoint'])))) - - print('FOR CHECKING : ', - len(rest.drop_duplicates(['datapoint'])) + len( - not_match_in_uniprot.drop_duplicates(['datapoint'])) + len( - pdb_aligned.drop_duplicates(['datapoint'])) + len( - swiss_match.drop_duplicates(['datapoint'])) + len( - modbase_match.drop_duplicates(['datapoint'])) == data_size) - no_info = None - align = None - not_in_aligned = None - not_models = None - modbase_not_match = None - - # Final corrections - - # Now 3D alignment. - pdb = pdb_aligned.copy() - swiss = swiss_match.copy() - modbase = modbase_match.copy() - - pdb_aligned = None - swiss_match = None - modbase_match = None - - """ - WHAT DO WE HAVE NOW? - - uniprot sequence not found - - pdb aligned - - swiss aligned - - modbase aligned - - not aligned with anything (rest) - """ - - # Fix the axes and merge all data. - - pdb.drop(['pdbInfo'], axis=1, inplace=True) - pdb.rename(columns={'resolution': 'score'}, inplace=True) - swiss.rename(columns={'qmean_norm': 'score'}, inplace=True) - modbase.rename(columns={'qmean_norm': 'score'}, inplace=True) - - swiss = swiss[pdb.columns] - modbase = modbase[pdb.columns] - pdb['source'] = 'PDB' - swiss['source'] = 'SWISSMODEL' - modbase['source'] = 'MODBASE' - data = pd.concat([swiss, modbase, pdb]) - - data.reset_index(inplace=True) - data.drop(['index'], axis=1, inplace=True) - data = data.astype('str') - data_spare = pd.concat([not_match_in_uniprot, rest]) - not_match_in_uniprot = None - pdb = None - swiss = None - modbase = None - rest = None - - print('Generating FreeSASA files...') - print('------------------------------------\n') - # Folder to calculated RSA values. 
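# --- editor's note ----------------------------------------------------------
# Both copies of the distance loop (the old one removed just below and the new
# one re-added near the end of this section) repeatedly use
#     except:
#         ValueError
# A bare except catches every exception, and the bare name ValueError on the
# next line is a no-op expression, not a filter. The intended spelling, shown
# with a hypothetical stand-in for the patch's get_coords helper:
def get_coords(pos):
    # stand-in: the real helper returns coordinates or raises on bad input
    raise ValueError('position not resolvable')

try:
    coordMut = get_coords('nan')
except ValueError:                 # catch only the failure we expect
    coordMut = 'nan'
# --- end editor's note --------------------------------------------------------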
- - existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*")) - - existing_free_sasa = [str(i) for i in existing_free_sasa] - existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa] - - print('Calculation RSA for PDB Structure Files...\n') - - pdb_only = data[data.source == 'PDB'] - for pdbID in pdb_only.pdbID.to_list(): - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), - include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - print('Calculation RSA for SwissModel Files...\n') - swiss_only = data[data.source == 'SWISSMODEL'] - swiss_dp = [] - for i in swiss_only.index: - swiss_dp.append(swiss_only.at[i, 'uniprotID'] + '_' + swiss_only.at[i, 'pdbID'].lower() + '_' + str( - round(float(swiss_only.at[i, 'score']), 2))) - for pdbID in swiss_dp: - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'swissmodel_structures' / f'{pdbID}.txt'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - print('Calculation RSA for Modbase Model Files...\n') - modbase_only = data[data.source == 'MODBASE'] - for pdbID in modbase_only.pdbID.to_list(): - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), - include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - # This annotation list is different than the prev one, keep it. - - annotation_list += ['domainStartonPDB', 'domainEndonPDB'] - - folder_path = path_to_output_files / 'freesasa_files' - - aligner = Align.PairwiseAligner() - print('Proceeding to 3D distance calculation...\n') - - data.domainEndonPDB = data.domainEndonPDB.astype(str) - data.domainStartonPDB = data.domainStartonPDB.astype(str) - - existing_free_sasa = None - swiss_dp = None - pdb_only = None - swiss_only = None - modbase_only = None - data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C') - data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C') - for i in data.index: - id_ = data.at[i, 'pdbID'].lower() - up_id_ = data.at[i, 'uniprotID'] - score_ = str(data.at[i, 'score']) - if data.at[i, 'source'] == 'PDB': - pdb_path = Path(path_to_output_files / 'pdb_structures' / f'{id_}.pdb') - elif data.at[i, 'source'] == 'MODBASE': - pdb_path = Path(path_to_output_files / 'modbase_structures_individual' / f'{id_}.txt') - elif data.at[i, 'source'] == 'SWISSMODEL': - pdb_path = Path(path_to_output_files / 'swissmodel_structures' / f'{up_id_}_{id_}_{score_}.txt') - - pdbSequence = data.at[i, 'pdbSequence'] - source = data.at[i, 'source'] - chain = data.at[i, 'chain'] - uniprotID = data.at[i, 'uniprotID'] - pdbID = data.at[i, 'pdbID'] - alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, - Path(path_to_output_files / '3D_alignment'), file_format='gzip') - mutPos = data.at[i, 'mutationPositionOnPDB'] - try: - coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0] - except: - ValueError - coordMut = 'nan' - try: - sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2] - data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], - sasa_pos, data.at[i, 'wt'], mode, 
path_to_output_files, file_type='pdb') - except: - ValueError - data.at[i, 'sasa'] = 'nan' # mutation position is nan - for annot in annotation_list: - annotx = [] - try: - positions_of_annotations = data.at[i, annot].split(',') - for pos in positions_of_annotations: - pos = pos.strip().strip('\'').strip('[\'').strip('\']') - try: - if '-' not in pos: - pos = int(float(pos)) - coordAnnot = get_coords(pos, alignments, 'nan', 'nan', mode)[0] - try: - annotx.append(find_distance(coordMut, coordAnnot)) - except: - ValueError - - else: - for r in range(int(pos.split('-')[0]), int(pos.split('-')[1]) + 1): - coordAnnot = get_coords(r, alignments, 'nan', 'nan', mode)[0] - annotx.append(find_distance(coordMut, coordAnnot)) - except: - ValueError - try: - data.at[i, annot] = min([float(i) for i in annotx]) - except: - ValueError - data.at[i, annot] = 'nan' + modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - except: - ValueError + else: + modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - if (str(data.at[i, 'domainStartonPDB']) == 'NaN' or str(data.at[i, 'domainStartonPDB']) == 'nan') and ( - str(data.at[i, 'domainEndonPDB']) != 'NaN' and str(data.at[i, 'domainEndonPDB']) != 'nan'): - data.at[i, 'domainStartonPDB'] = 100000 - elif (str(data.at[i, 'domainEndonPDB']) == 'NaN' or str(data.at[i, 'domainEndonPDB']) == 'nan') and ( - str(data.at[i, 'domainStartonPDB']) != 'NaN' and str(data.at[i, 'domainStartonPDB']) != 'nan'): - data.at[i, 'domainEndonPDB'] = 100000 - elif (str(data.at[i, 'domainStartonPDB']) == 'NaN' and str(data.at[i, 'domainEndonPDB']) == 'nan'): - data.at[i, 'domaindistance3D'] = 'nan' + which_ones_are_match = None + modbase_match.score = modbase_match.score.astype('float') + modbase_match = modbase_match.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], + ascending=[True, True, False]) + modbase_match.drop_duplicates(['datapoint'], keep='first', inplace=True) + not_nan = None + nan = None - data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']), - float(data.at[i, 'domainEndonPDB'])) - data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']), - float(data.at[i, 'domainEndonPDB'])) - data = data.astype(str) - data.replace({'NaN': 'nan'}, inplace=True) + # merge not_in_align and modbase_not_match as they were both excluded from modbase match. 
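# --- editor's note ----------------------------------------------------------
# The eight-branch if/elif ladder just below only decides which of the three
# frames are non-empty before concatenating them. pd.concat accepts any number
# of frames, so the same logic fits in two lines. A sketch using the patch's
# own variables; the fallback assumes to_swiss_columns matches the literal
# column list in the ladder's else branch:
frames = [df for df in (not_in_aligned, modbase_not_match, no_info) if len(df) != 0]
rest = pd.concat(frames) if frames else pd.DataFrame(columns=to_swiss_columns)
# --- end editor's note --------------------------------------------------------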
+ + # No model + no_info = no_info[to_swiss_columns] + no_info = no_info.drop_duplicates() + + # Model present, no sequence + not_models = not_models[to_swiss_columns] + not_models = not_models.drop_duplicates() + + # Modbase model and sequence present, no match in PDB + modbase_not_match = modbase_not_match[to_swiss_columns] + modbase_not_match = modbase_not_match.drop_duplicates() + if len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) != 0: + rest = pd.concat([not_in_aligned, modbase_not_match, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) == 0: + rest = pd.concat([not_in_aligned, modbase_not_match]) + elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) != 0: + rest = pd.concat([modbase_not_match, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) != 0: + rest = pd.concat([not_in_aligned, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) == 0: + rest = not_in_aligned + elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) == 0: + rest = modbase_not_match + elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0: + rest = no_info + else: + rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) - # Now unify all 3 separate data. We have with_pdb. The ones that have pdb structyres, swiss, modbase, the ones didnt match with ant and the ones didnt have wt seq match. + rest = rest[to_swiss_columns] + rest = rest.drop_duplicates() - # Get interface positions from ECLAIR. 
Download HQ human - print() - print('Assigning surface regions...') - print('------------------------------------\n') + rest.reset_index(inplace=True) + rest.drop(['index'], axis=1, inplace=True) + rest = rest.astype('str') - print('Extracting interface residues...\n') - data_interface = pd.read_csv(path_to_interfaces, sep='\t') - positions = get_interface_positions(data_interface, 'P1', 'P2') + else: - interface_dataframe = pd.DataFrame() + modbase_match = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', + 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', + 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) + not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide', + 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', + 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) + no_info = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) + rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) + 
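# --- editor's note ----------------------------------------------------------
# The empty-schema literals above repeat the same ~70 column names several
# times, and the variants differ only in their tails. Building them from
# annotation_list, which is already in scope, would keep the copies in sync.
# A sketch, assuming the groupings below match the literals:
base_cols = ['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity',
             'volume', 'granthamScore', 'domain', 'domStart', 'domEnd',
             'distance', 'uniprotSequence', 'wt_sequence_match',
             'whichIsoform', 'datapoint']
binary_cols = [a + 'Binary' for a in annotation_list]
model_cols = ['from', 'to', 'template', 'chain', 'score', 'pdbID',
              'pdbSequence', 'pdb_alignStatus', 'mutationPositionOnPDB',
              'domainStartonPDB', 'domainEndonPDB']
modbase_match_cols = base_cols + annotation_list + binary_cols + model_cols
# --- end editor's note --------------------------------------------------------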
+ rest = rest[to_swiss_columns] + rest = rest.drop_duplicates() + + rest.reset_index(inplace=True) + rest.drop(['index'], axis=1, inplace=True) + rest = rest.astype('str') + to_modbase_size = 0 + + print('Modbase matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' % ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d successfully aligned with SwissModels structures.' % ( + len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) + print('--%d of %d successfully aligned with Modbase structures.\n' % ( + len(modbase_match.drop_duplicates(['datapoint'])), to_modbase_size)) + print('--Remaining %d not found to match any models.' % len(rest.drop_duplicates(['datapoint']))) + print('--A total of %d datapoints will not be evaluated.\n' % ( + len(rest.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])))) + + print('FOR CHECKING : ', + len(rest.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) + len( + pdb_aligned.drop_duplicates(['datapoint'])) + len(swiss_match.drop_duplicates(['datapoint'])) + len( + modbase_match.drop_duplicates(['datapoint'])) == data_size) + no_info = None + align = None + not_in_aligned = None + not_models = None + modbase_not_match = None + + + # Final corrections + + # Now 3D alignment. + pdb = pdb_aligned.copy() + swiss = swiss_match.copy() + modbase = modbase_match.copy() + pdb_aligned = None + swiss_match = None + modbase_match = None + + """ + WHAT DO WE HAVE NOW? + - uniprot sequence not found + - pdb aligned + - swiss aligned + - modbase aligned + - not aligned with anything (rest) + """ + + # Fix the axes and merge all data. + + + pdb.drop(['pdbInfo'], axis=1, inplace=True) + pdb.rename(columns={'resolution': 'score'}, inplace=True) + swiss.rename(columns={'qmean_norm': 'score'}, inplace=True) + modbase.rename(columns={'qmean_norm': 'score'}, inplace=True) + + swiss = swiss[pdb.columns] + modbase = modbase[pdb.columns] + pdb['source'] = 'PDB' + swiss['source'] = 'SWISSMODEL' + modbase['source'] = 'MODBASE' + data = pd.concat([swiss, modbase, pdb]) + + + data.reset_index(inplace=True) + data.drop(['index'], axis=1, inplace=True) + data = data.astype('str') + data_spare = pd.concat([not_match_in_uniprot, rest]) + not_match_in_uniprot = None + pdb = None + swiss = None + modbase = None + rest = None + + print('Generating FreeSASA files...') + print('------------------------------------\n') + # Folder to calculated RSA values. 
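# --- editor's note ----------------------------------------------------------
# The skip-if-already-computed check below recovers file stems with
# i.split('/')[-1].split('.')[0], which is POSIX-only and truncates at the
# first dot. SwissModel identifiers embed the rounded score (a hypothetical
# P12345_1abc_0.89 reduces to P12345_1abc_0), so those lookups never match and
# the RSA files are recomputed every run. pathlib's stem strips only the final
# suffix and is portable:
from pathlib import Path

existing_free_sasa = {p.stem for p in (path_to_output_files / 'freesasa_files').glob('*')}
# membership tests like `pdbID in existing_free_sasa` then work unchanged,
# and the set makes each lookup O(1) instead of scanning a list.
# --- end editor's note --------------------------------------------------------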
+
+ existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
+ existing_free_sasa = [str(f) for f in existing_free_sasa]
+ existing_free_sasa = [f.split('/')[-1].split('.')[0] for f in existing_free_sasa]
+
+ print('Calculating RSA for PDB structure files...\n')
+
+ pdb_only = data[data.source == 'PDB']
+ for pdbID in pdb_only.pdbID.to_list():
+ if pdbID not in existing_free_sasa:
+ run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
+ outdir=None, force_rerun=False, file_type='pdb')
+
+ print('Calculating RSA for SwissModel files...\n')
+ swiss_only = data[data.source == 'SWISSMODEL']
+ swiss_dp = []
+ for i in swiss_only.index:
+ swiss_dp.append(swiss_only.at[i, 'uniprotID'] + '_' + swiss_only.at[i, 'pdbID'].lower() + '_' + str(
+ round(float(swiss_only.at[i, 'score']), 2)))
+ for pdbID in swiss_dp:
+ if pdbID not in existing_free_sasa:
+ run_freesasa(Path(path_to_output_files / 'swissmodel_structures' / f'{pdbID}.txt'),
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True,
+ outdir=None, force_rerun=False, file_type='pdb')
+
+ print('Calculating RSA for Modbase model files...\n')
+ modbase_only = data[data.source == 'MODBASE']
+ for pdbID in modbase_only.pdbID.to_list():
+ if pdbID not in existing_free_sasa:
+ run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
+ outdir=None, force_rerun=False, file_type='pdb')
+
+ # This annotation list differs from the previous one; keep it.
+
+ annotation_list += ['domainStartonPDB', 'domainEndonPDB']
+
+ folder_path = path_to_output_files / 'freesasa_files'
+
+ aligner = Align.PairwiseAligner()
+ print('Proceeding to 3D distance calculation...\n')
+
+ data.domainEndonPDB = data.domainEndonPDB.astype(str)
+ data.domainStartonPDB = data.domainStartonPDB.astype(str)
+
+ existing_free_sasa = None
+ swiss_dp = None
+ pdb_only = None
+ swiss_only = None
+ modbase_only = None
+ data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
+ data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
+ for i in data.index:
+ id_ = data.at[i, 'pdbID'].lower()
+ up_id_ = data.at[i, 'uniprotID']
+ score_ = str(data.at[i, 'score'])
+ if data.at[i, 'source'] == 'PDB':
+ pdb_path = Path(path_to_output_files / 'pdb_structures' / f'{id_}.pdb')
+ elif data.at[i, 'source'] == 'MODBASE':
+ pdb_path = Path(path_to_output_files / 'modbase_structures_individual' / f'{id_}.txt')
+ elif data.at[i, 'source'] == 'SWISSMODEL':
+ pdb_path = Path(path_to_output_files / 'swissmodel_structures' / f'{up_id_}_{id_}_{score_}.txt')
+
+ pdbSequence = data.at[i, 'pdbSequence']
+ source = data.at[i, 'source']
+ chain = data.at[i, 'chain']
+ uniprotID = data.at[i, 'uniprotID']
+ pdbID = data.at[i, 'pdbID']
+ alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format='gzip')
+ mutPos = data.at[i, 'mutationPositionOnPDB']
+ try:
+ coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
+ except Exception:  # most commonly raised when the mutation position is 'nan'
+ coordMut = 'nan'
+ try:
+ sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
+ data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, data.at[i, 'wt'], mode,
+ path_to_output_files, file_type='pdb')
+ except Exception:
+ data.at[i, 'sasa'] = 'nan'  # mutation position is nan
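+ # The loop below keeps, for each annotation class, the 3D distance from the
+ # mutated residue to the closest annotated residue. A minimal sketch of that
+ # rule, assuming coordinates are (x, y, z) tuples (min_annotation_distance
+ # is an illustrative name, not a pipeline function):
+ #
+ # def min_annotation_distance(coord_mut, annot_coords):
+ #     return min(sum((a - b) ** 2 for a, b in zip(c, coord_mut)) ** 0.5
+ #                for c in annot_coords)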
+ for annot in annotation_list:
+ annotx = []
+ try:
+ positions_of_annotations = data.at[i, annot].split(',')
+ for pos in positions_of_annotations:
+ pos = pos.strip().strip('\'').strip('[\'').strip('\']')
+ try:
+ if '-' not in pos:
+ pos = int(float(pos))
+ coordAnnot = get_coords(pos, alignments, 'nan', 'nan', mode)[0]
+ try:
+ annotx.append(find_distance(coordMut, coordAnnot))
+ except Exception:  # coordMut or coordAnnot may be 'nan'
+ pass

- for key, val in positions.items():
- k = pd.Series((key, str(list(set(val)))))
- interface_dataframe = interface_dataframe.append(k, ignore_index=True)
- interface_dataframe.columns = ['uniprotID', 'positions']

+ else:
+ for r in range(int(pos.split('-')[0]), int(pos.split('-')[1]) + 1):
+ coordAnnot = get_coords(r, alignments, 'nan', 'nan', mode)[0]
+ annotx.append(find_distance(coordMut, coordAnnot))
+ except Exception:
+ pass
+ try:
+ data.at[i, annot] = min([float(d) for d in annotx])
+ except Exception:  # annotx may be empty
+ data.at[i, annot] = 'nan'

- if len(data) == 0:
- data = pd.DataFrame(
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
- 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
- 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
- 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
- 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
- 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
- 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
- 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
- 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
- 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
- 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
- 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
- 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
- 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
- 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
- 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
- else:
- data.sasa = data.sasa.astype('str')
-
- for i in data.index:
- if '*' in data.at[i, 'sasa']:
- data.at[i, 'sasa'] = data.at[i, 'sasa'].split('*')[0]
-
- data.sasa = data.sasa.replace({'N/A': 'nan'})
- data.sasa = data.sasa.replace({'None': 'nan'})
- data.replace({' N/A': 'nan'}, inplace=True)
- data.replace({'None': 'nan'}, inplace=True)
- data.sasa = data.sasa.astype(float)
- data = data.astype(str)
- for i in data.index:
- if float(data.at[i, 'sasa']) < 5:
- data.at[i, 'trsh4'] = 'core'
- elif float(data.at[i, 'sasa']) >= 5:
- data.at[i, 'trsh4'] = 'surface'
- elif data.at[i, 'sasa'] == 'nan':
- data.at[i, 'trsh4'] = 'nan'
-
- data = data.merge(interface_dataframe, on='uniprotID', how='left')
- data.positions = data.positions.astype('str')
- for i in data.index:
- if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
- print((str(data.at[i, 'pos']) in data.at[i, 'positions']))
- data.at[i,
'threeState_trsh4_HQ'] = 'interface' - elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface': - data.at[i, 'threeState_trsh4_HQ'] = 'surface' - elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': - data.at[i, 'threeState_trsh4_HQ'] = 'core' - elif (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': - data.at[i, 'threeState_trsh4_HQ'] = 'conflict' - elif data.at[i, 'trsh4'] == 'nan': - data.at[i, 'threeState_trsh4_HQ'] = 'nan' - - data.drop(['positions'], axis=1, inplace=True) - - # OPTIONAL - # DOMAIN SELECTION - # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most - # significant domains and 53th category will be NULL. - - fisherResult = pd.read_csv(fisher_path, sep='\t') - - significant_domains = fisherResult.domain.to_list() - for i in data.index: - if data.at[i, 'domain'] in significant_domains: - data.at[i, 'domain_fisher'] = data.at[i, 'domain'] - else: - data.at[i, 'domain_fisher'] = 'NULL' - - # Change the numbering for binary annotations and create 3 classes: - # nan--> 0, 0 -->1 and 1 -->2 - - print('Final adjustments are being done...\n') - binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', - 'dnaBindingBinary', - 'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'caBindingBinary', 'topologicalDomainBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary'] - data = data.astype(str) - data.replace({'NaN': 'nan'}, inplace=True) - for i in data.index: - for j in binaryCols: - data[j] = data[j].astype('str') - if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'): - data.at[i, j] = '1' - elif data.at[i, j] == 'nan': - data.at[i, j] = '0' - elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'): - data.at[i, j] = '2' - - annotCols = ['disulfide', 'intMet', 'intramembrane', - 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding', - 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', - 'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding', - 'topologicalDomain', 'bindingSite', 'region', 'signalPeptide', - 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide', - 'transitPeptide', 'glycosylation', 'propeptide'] - - for i in data.index: - for annot in annotCols: - binaryName = str(annot) + 'Binary' - if data.at[i, binaryName] == '2': - data.at[i, annot] = '0.0' - data.replace({'100000': 'nan'}, inplace=True) - data = add_physicochemical(data) - data.rename( - columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue', - 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db', - 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig', - 'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state', - 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin', - 'intramembraneBinary': 'intramembrane_bin', - 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin', - 'activeSiteBinary': 'activeSite_bin', - 
'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin', - 'siteBinary': 'site_bin', - 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin', - 'mutagenesisBinary': 'mutagenesis_bin', - 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin', - 'metalBindingBinary': 'metalBinding_bin', - 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin', - 'caBindingBinary': 'caBinding_bin', - 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin', - 'signalPeptideBinary': 'signalPeptide_bin', - 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin', - 'motifBinary': 'motif_bin', - 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin', - 'transitPeptideBinary': 'transitPeptide_bin', - 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin', - 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist', - 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist', - 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist', - 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', - 'site': 'site_dist', - 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist', - 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', - 'turn': 'turn_dist', - 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist', - 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist', - 'bindingSite': 'bindingSite_dist', 'region': 'region_dist', - 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist', - 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist', - 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist', - 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True) - - data = data[ - ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity', - 'volume', - 'granthamScore', 'domains_all', - 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin', - 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin', - 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin', - 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin', - 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin', - 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin', - 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin', - 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin', - 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist', - 'intramembrane_dist', - 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist', - 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist', - 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist', - 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist', - 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist', - 'bindingSite_dist', 'region_dist', 'signalPeptide_dist', - 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist', - 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist', - 'glycosylation_dist', 'propeptide_dist']] - - ready = data.copy() - # Imputation - if (impute == 'True') or (impute == 'true') or (impute == True): - filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, - 
15.99, 16.82,
- 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33,
- 22.36]
- col_index = 0
- for col_ in ready.columns[-30:]:
- ready[col_] = ready[col_].fillna(filler[col_index])
- ready[col_] = ready[col_].replace({'nan': filler[col_index]})
- col_index += 1
- ready['domains_3Ddist'] = ready['domains_3Ddist'].fillna(24.5)
- ready['sasa'] = ready['sasa'].fillna(29.5)
- ready['location_3state'] = ready['location_3state'].fillna('unknown')
- elif (impute == 'False') or (impute == 'false') or (impute == False):
- pass
- ready = ready.replace({'nan': np.NaN})
- ready = ready.astype(str)
- ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
- if len(ready) == 0:
- print(
- 'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
- #st.write(ready)
- print('Feature vector successfully created...')
- end = timer()
- hours, rem = divmod(end - start, 3600)
- minutes, seconds = divmod(rem, 60)
- print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
+ except Exception:
+ pass
+ if (str(data.at[i, 'domainStartonPDB']) in ('NaN', 'nan')) and (
+ str(data.at[i, 'domainEndonPDB']) not in ('NaN', 'nan')):
+ data.at[i, 'domainStartonPDB'] = 100000
+ elif (str(data.at[i, 'domainEndonPDB']) in ('NaN', 'nan')) and (
+ str(data.at[i, 'domainStartonPDB']) not in ('NaN', 'nan')):
+ data.at[i, 'domainEndonPDB'] = 100000
+ if (str(data.at[i, 'domainStartonPDB']) in ('NaN', 'nan')) and (
+ str(data.at[i, 'domainEndonPDB']) in ('NaN', 'nan')):
+ data.at[i, 'domaindistance3D'] = 'nan'
+ else:
+ data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
+ float(data.at[i, 'domainEndonPDB']))
+
+ data = data.astype(str)
+ data.replace({'NaN': 'nan'}, inplace=True)
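+ # The 100000 sentinel above only lets min() pick the one known endpoint
+ # distance; any sentinel left in the domainStartonPDB/domainEndonPDB columns
+ # is mapped back to 'nan' further below via data.replace({'100000': 'nan'}).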
+
+ # Now unify the three sources: datapoints aligned to PDB structures, SwissModel
+ # models and Modbase models; the ones that matched no structure and the ones
+ # without a wild-type sequence match are kept aside.
+
+ # Get interface positions from ECLAIR. Download the HQ human interface set.
+ print()
+ print('Assigning surface regions...')
+ print('------------------------------------\n')
+
+ print('Extracting interface residues...\n')
+ data_interface = pd.read_csv(path_to_interfaces, sep='\t')
+
+ positions = get_interface_positions(data_interface, 'P1', 'P2')
+
+ # DataFrame.append was removed in pandas 2.0; build from a list of rows instead.
+ interface_rows = []
+ for key, val in positions.items():
+ interface_rows.append((key, str(list(set(val)))))
+ interface_dataframe = pd.DataFrame(interface_rows, columns=['uniprotID', 'positions'])
+
+ if len(data) == 0:
+ data = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+ 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
+ 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
+ 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+ 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+ 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+ 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+ 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+ 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
+ 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
+ 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
+ else:
+ data.sasa = data.sasa.astype('str')
+
+ for i in data.index:
+ if '*' in data.at[i, 'sasa']:
+ data.at[i, 'sasa'] = data.at[i, 'sasa'].split('*')[0]
+
+ data.sasa = data.sasa.replace({'N/A': 'nan'})
+ data.sasa = data.sasa.replace({'None': 'nan'})
+ data.replace({' N/A': 'nan'}, inplace=True)
+ data.replace({'None': 'nan'}, inplace=True)
+ data.sasa = data.sasa.astype(float)
+ data = data.astype(str)
+ for i in data.index:
+ if float(data.at[i, 'sasa']) < 5:
+ data.at[i, 'trsh4'] = 'core'
+ elif float(data.at[i, 'sasa']) >= 5:
+ data.at[i, 'trsh4'] = 'surface'
+ elif data.at[i, 'sasa'] == 'nan':
+ data.at[i, 'trsh4'] = 'nan'
+
+ data = data.merge(interface_dataframe, on='uniprotID', how='left')
+ data.positions = data.positions.astype('str')
+ for i in data.index:
+ # 'positions' holds a stringified list (e.g. "[12, 40]"); parse it so the
+ # membership test is exact rather than a substring match.
+ pos_list = [p.strip(" '") for p in data.at[i, 'positions'].strip('[]').split(',') if p.strip()]
+ in_interface = str(data.at[i, 'pos']) in pos_list
+ if in_interface and data.at[i, 'trsh4'] == 'surface':
+ data.at[i, 'threeState_trsh4_HQ'] = 'interface'
+ elif (not in_interface) and data.at[i, 'trsh4'] == 'surface':
+ data.at[i, 'threeState_trsh4_HQ'] = 'surface'
+ elif (not in_interface) and data.at[i, 'trsh4'] == 'core':
+ data.at[i, 'threeState_trsh4_HQ'] = 'core'
+ elif in_interface and data.at[i, 'trsh4'] == 'core':
+ data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
+ elif data.at[i, 'trsh4'] == 'nan':
+ data.at[i, 'threeState_trsh4_HQ'] = 'nan'
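+ # The classification above in one place, as a sketch (classify_location is an
+ # illustrative helper, not a pipeline function; 5 is the same RSA threshold):
+ #
+ # def classify_location(rsa, is_interface_pos):
+ #     if rsa != rsa:  # float('nan') compares unequal to itself
+ #         return 'nan'
+ #     if rsa < 5:
+ #         return 'conflict' if is_interface_pos else 'core'
+ #     return 'interface' if is_interface_pos else 'surface'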
+
+ data.drop(['positions'], axis=1, inplace=True)
+
+ # OPTIONAL
+ # DOMAIN SELECTION
+ # Replace all non-significant domains with 'NULL': R can handle 53 categories,
+ # so the 52 most significant domains are kept and 'NULL' becomes the 53rd.
+
+ fisherResult = pd.read_csv(fisher_path, sep='\t')
+
+ significant_domains = fisherResult.domain.to_list()
+ for i in data.index:
+ if data.at[i, 'domain'] in significant_domains:
+ data.at[i, 'domain_fisher'] = data.at[i, 'domain']
+ else:
+ data.at[i, 'domain_fisher'] = 'NULL'
+
+ # Recode the binary annotations into three classes: nan -> 0, 0 -> 1, 1 -> 2.
+
+ print('Final adjustments are being done...\n')
+ binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary',
+ 'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+ 'repeatBinary', 'caBindingBinary', 'topologicalDomainBinary',
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+ 'glycosylationBinary', 'propeptideBinary']
+ data = data.astype(str)
+ data.replace({'NaN': 'nan'}, inplace=True)
+ for j in binaryCols:
+ data[j] = data[j].astype('str')
+ for i in data.index:
+ for j in binaryCols:
+ if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'):
+ data.at[i, j] = '1'
+ elif data.at[i, j] == 'nan':
+ data.at[i, j] = '0'
+ elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'):
+ data.at[i, j] = '2'
+
+ annotCols = ['disulfide', 'intMet', 'intramembrane',
+ 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+ 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+ 'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding',
+ 'topologicalDomain', 'bindingSite', 'region', 'signalPeptide',
+ 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
+ 'transitPeptide', 'glycosylation', 'propeptide']
+
+ for i in data.index:
+ for annot in annotCols:
+ binaryName = str(annot) + 'Binary'
+ if data.at[i, binaryName] == '2':
+ data.at[i, annot] = '0.0'
+ data.replace({'100000': 'nan'}, inplace=True)
+ data = add_physicochemical(data)
+ data.rename(
+ columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
+ 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
+ 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
+ 'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
+ 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
+ 'intramembraneBinary': 'intramembrane_bin',
+ 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
+ 'activeSiteBinary': 'activeSite_bin',
+ 'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
+ 'siteBinary': 'site_bin',
+ 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
+ 'mutagenesisBinary': 'mutagenesis_bin',
+ 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
+ 'metalBindingBinary': 'metalBinding_bin',
+ 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
+ 'caBindingBinary': 'caBinding_bin',
+ 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
+ 'signalPeptideBinary': 'signalPeptide_bin',
+ 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
+ 'motifBinary': 'motif_bin',
+ 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
+ 'transitPeptideBinary': 'transitPeptide_bin',
+ 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
+ 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
+ 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
+ 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
+ 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist',
+ 'site': 'site_dist',
+ 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
+ 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist',
+ 'turn': 'turn_dist',
+ 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
+ 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
+ 'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
+ 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
+ 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
+ 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
+ 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
+
+ data = data[
+ ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity',
+ 'volume',
+ 'granthamScore', 'domains_all',
+ 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
+ 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
+ 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
+ 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
+ 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
+ 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
+ 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
+ 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
+ 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
+ 'intramembrane_dist',
+ 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
+ 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
+ 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
+ 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
+ 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
+ 'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
+ 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
+ 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
+ 'glycosylation_dist', 'propeptide_dist']]
+ ready = data.copy()
+ # Imputation (str() also covers a boolean impute argument)
+ if str(impute).lower() == 'true':
+ filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, 16.82,
+ 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
+ col_index = 0
+ for col_ in ready.columns[-30:]:
+ ready[col_] = ready[col_].fillna(filler[col_index])
+ ready[col_] = ready[col_].replace({'nan': filler[col_index]})
+ col_index += 1
+ ready['domains_3Ddist'] = ready['domains_3Ddist'].fillna(24.5)
+ ready['sasa'] = ready['sasa'].fillna(29.5)
+ ready['location_3state'] = ready['location_3state'].fillna('unknown')
+ elif str(impute).lower() == 'false':
+ pass
+
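+ # The 30 filler values above pair positionally with the 30 *_dist columns at
+ # the end of the frame; an equivalent, more explicit form for the NaN case
+ # would be a dict keyed by column name (sketch only):
+ #
+ # ready.fillna(dict(zip(ready.columns[-30:], filler)), inplace=True)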
+ ready = ready.replace({'nan': np.NaN})
+ ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
+ if len(ready) == 0:
+ print('No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
+ print(ready)
+ print('Feature vector successfully created...')
- return ready
- except:
- AttributeError
-
-
+ end = timer()
+ hours, rem = divmod(end - start, 3600)
+ minutes, seconds = divmod(rem, 60)
+ print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
+ sys.stdout.close()
+ return ready
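+
+ # Minimal usage sketch (illustrative only: the file name and argument values
+ # are hypothetical; the accepted input formats are defined by clean_data and
+ # manage_files):
+ #
+ # if __name__ == '__main__':
+ #     feature_vector = pdb('sample_input.txt', mode=1, impute='True')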