"""Unpack AlphaFold structure archives and write a per-model sequence summary.

Run as a script, this extracts every archive matching
``input_files/<file_name>`` into ``input_files/alphafold_structures/`` and then
writes ``input_files/alphafold_summary.txt`` with one tab-separated row
(``uniprotID``, ``chain``, ``sequence``, ``model_num``) per extracted PDB file.
"""
import argparse
import glob
import os
import tarfile

import numpy as np

# Building the parser has no side effects; the command line is only parsed in
# main() / create_file(), so importing this module never touches sys.argv.
parser = argparse.ArgumentParser(description='ASCARIS')
parser.add_argument('-file_name', '--file_name',
                    help='Enter the file tar file name to untar', default=1)

STRUCTURES_DIR = 'input_files/alphafold_structures/'
SUMMARY_PATH = 'input_files/alphafold_summary.txt'

# Three-letter residue name -> one-letter code.
# NOTE(review): 'ASX' is conventionally 'B'; 'O' is kept so existing output
# does not change -- confirm whether this mapping is intentional.
_THREE_TO_ONE = {
    'ALA': 'A', 'ARG': 'R', 'VAL': 'V', 'GLU': 'E', 'PRO': 'P',
    'LEU': 'L', 'GLY': 'G', 'ASN': 'N', 'SER': 'S', 'GLN': 'Q',
    'THR': 'T', 'MET': 'M', 'LYS': 'K', 'ASP': 'D', 'ILE': 'I',
    'PHE': 'F', 'TRP': 'W', 'TYR': 'Y', 'HIS': 'H', 'CYS': 'C',
    'UNK': 'X', 'ASX': 'O',
}


def threeToOne(variant):
    """Return the one-letter code for a three-letter residue name.

    Names that are not in the lookup table (including lower-case spellings)
    are returned unchanged.
    """
    return _THREE_TO_ONE.get(variant, variant)


def create_file(file_name=None):
    """Extract the structure archives and (re)write the summary file.

    Args:
        file_name: Glob pattern, relative to ``input_files/``, selecting the
            tar archive(s) to extract. When ``None`` the value of the
            ``--file_name`` command-line option is used.

    Raises:
        IndexError: If a structure file has no CA atoms, or its name does not
            contain at least three ``-``-separated fields.
    """
    if file_name is None:
        file_name = parser.parse_args().file_name

    # Imported here rather than at module level so that threeToOne() stays
    # importable in environments where biopandas is not installed.
    from biopandas.pdb import PandasPdb

    # Unzip AlphaFold structures
    os.makedirs(STRUCTURES_DIR, exist_ok=True)
    for archive in glob.glob(f'input_files/{file_name}'):
        with tarfile.open(archive) as tar:
            if hasattr(tarfile, 'data_filter'):
                # Refuse members that would escape the target directory
                # (absolute paths, '..', unsafe links) where supported.
                tar.extractall(STRUCTURES_DIR, filter='data')
            else:
                tar.extractall(STRUCTURES_DIR)

    # Create summary file
    with open(SUMMARY_PATH, 'w') as summary:
        summary.write('uniprotID\tchain\tsequence\tmodel_num\n')
        for path in glob.glob('input_files/alphafold_structures/*pdb*'):
            atoms = PandasPdb().read_pdb(path).df['ATOM']
            atoms = atoms[['alt_loc', 'residue_name', 'residue_number',
                           'atom_name', 'insertion', 'chain_id']]
            # One CA atom per residue; copy so the column assignments below
            # act on an independent frame instead of a view of `atoms`.
            ca_atoms = atoms[atoms.atom_name == 'CA'].copy()
            ca_atoms['residue_name'] = ca_atoms['residue_name'].apply(threeToOne)
            # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
            ca_atoms['alt_loc'] = ca_atoms['alt_loc'].replace({'': np.nan})
            ca_atoms = ca_atoms.drop_duplicates(['residue_name', 'residue_number'])
            sequence = ''.join(ca_atoms.residue_name.to_list())

            # Split the base name only, so a '-' in a directory name cannot
            # shift the fields.
            # Assumes names shaped like 'AF-<uniprotID>-F<model>-...' -- TODO confirm.
            name_fields = os.path.basename(path).split('-')
            up_name = name_fields[1].strip()
            model_no = name_fields[2].strip()[1:]

            # First chain in file order: deterministic, unlike indexing a set.
            chain_id = ca_atoms.chain_id.iloc[0]

            summary.write(f'{up_name}\t{chain_id}\t{sequence}\t{model_no}\n')


def main():
    """Parse the command line and build the summary file."""
    args = parser.parse_args()
    create_file(args.file_name)


if __name__ == '__main__':
    main()