Update app.py
app.py
CHANGED
@@ -7,23 +7,135 @@ from lavis.models.protein_models.protein_function_opt import Blip2ProteinMistral
from lavis.models.base_model import FAPMConfig
import spaces
import gradio as gr
-from esm_scripts.extract import run_demo
+# from esm_scripts.extract import run_demo
from esm import pretrained, FastaBatchedDataset
-
-
-
-
-
-
-
-model
-
+from data.evaluate_data.utils import Ontology
+import difflib
+import re
+from transformers import MistralForCausalLM
+
+# Load the trained model
+def get_model(type='Molecule Function'):
+    model = Blip2ProteinMistral(config=FAPMConfig(), esm_size='3b')
+    if type == 'Molecule Function':
+        model.load_checkpoint("model/checkpoint_mf2.pth")
+        model.to('cuda')
+    elif type == 'Biological Process':
+        model.load_checkpoint("model/checkpoint_bp1.pth")
+        model.to('cuda')
+    elif type == 'Cellar Component':
+        model.load_checkpoint("model/checkpoint_cc2.pth")
+        model.to('cuda')
+    return model
+
+
+models = {
+    'Molecule Function': get_model('Molecule Function'),
+    'Biological Process': get_model('Biological Process'),
+    'Cellular Component': get_model('Cellar Component'),
+}
+
+
+# Load the mistral model
+mistral_model = MistralForCausalLM.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", torch_dtype=torch.float16).to('cuda')
+
+# Load ESM2 model
+model_esm, alphabet = pretrained.load_model_and_alphabet('esm2_t36_3B_UR50D')
+model_esm.to('cuda')
+model_esm.eval()
+
+godb = Ontology(f'data/go1.4-basic.obo', with_rels=True)
+go_des = pd.read_csv('data/go_descriptions1.4.txt', sep='|', header=None)
+go_des.columns = ['id', 'text']
+go_des = go_des.dropna()
+go_des['id'] = go_des['id'].apply(lambda x: re.sub('_', ':', x))
+go_obo_set = set(go_des['id'].tolist())
+go_des['text'] = go_des['text'].apply(lambda x: x.lower())
+GO_dict = dict(zip(go_des['text'], go_des['id']))
+Func_dict = dict(zip(go_des['id'], go_des['text']))
+
+terms_mf = pd.read_pickle('data/terms/mf_terms.pkl')
+choices_mf = [Func_dict[i] for i in list(set(terms_mf['gos']))]
+choices_mf = {x.lower(): x for x in choices_mf}
+terms_bp = pd.read_pickle('data/terms/bp_terms.pkl')
+choices_bp = [Func_dict[i] for i in list(set(terms_bp['gos']))]
+choices_bp = {x.lower(): x for x in choices_bp}
+terms_cc = pd.read_pickle('data/terms/cc_terms.pkl')
+choices_cc = [Func_dict[i] for i in list(set(terms_cc['gos']))]
+choices_cc = {x.lower(): x for x in choices_cc}
+choices = {
+    'Molecule Function': choices_mf,
+    'Biological Process': choices_bp,
+    'Cellular Component': choices_cc,
+}

@spaces.GPU
def generate_caption(protein, prompt):
+    # Process the image and the prompt
+    # with open('/home/user/app/example.fasta', 'w') as f:
+    #     f.write('>{}\n'.format("protein_name"))
+    #     f.write('{}\n'.format(protein.strip()))
+    # os.system("python esm_scripts/extract.py esm2_t36_3B_UR50D /home/user/app/example.fasta /home/user/app --repr_layers 36 --truncation_seq_length 1024 --include per_tok")
+    # esm_emb = run_demo(protein_name='protein_name', protein_seq=protein,
+    #                    model=model_esm, alphabet=alphabet,
+    #                    include='per_tok', repr_layers=[36], truncation_seq_length=1024)
+
+    protein_name = 'protein_name'
+    protein_seq = protein
+    include = 'per_tok'
+    repr_layers = [36]
+    truncation_seq_length = 1024
+    toks_per_batch = 4096
+    print("start")
+    dataset = FastaBatchedDataset([protein_name], [protein_seq])
+    print("dataset prepared")
+    batches = dataset.get_batch_indices(toks_per_batch, extra_toks_per_seq=1)
+    print("batches prepared")
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset, collate_fn=alphabet.get_batch_converter(truncation_seq_length), batch_sampler=batches
+    )
+    print(f"Read sequences")
+    return_contacts = "contacts" in include
+
+    assert all(-(model_esm.num_layers + 1) <= i <= model_esm.num_layers for i in repr_layers)
+    repr_layers = [(i + model_esm.num_layers + 1) % (model_esm.num_layers + 1) for i in repr_layers]

-
-
+    with torch.no_grad():
+        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
+            print(
+                f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)"
+            )
+            if torch.cuda.is_available():
+                toks = toks.to(device="cuda", non_blocking=True)
+            out = model_esm(toks, repr_layers=repr_layers, return_contacts=return_contacts)
+            representations = {
+                layer: t.to(device="cpu") for layer, t in out["representations"].items()
+            }
+            if return_contacts:
+                contacts = out["contacts"].to(device="cpu")
+            for i, label in enumerate(labels):
+                result = {"label": label}
+                truncate_len = min(truncation_seq_length, len(strs[i]))
+                # Call clone on tensors to ensure tensors are not views into a larger representation
+                # See https://github.com/pytorch/pytorch/issues/1995
+                if "per_tok" in include:
+                    result["representations"] = {
+                        layer: t[i, 1: truncate_len + 1].clone()
+                        for layer, t in representations.items()
+                    }
+                if "mean" in include:
+                    result["mean_representations"] = {
+                        layer: t[i, 1: truncate_len + 1].mean(0).clone()
+                        for layer, t in representations.items()
+                    }
+                if "bos" in include:
+                    result["bos_representations"] = {
+                        layer: t[i, 0].clone() for layer, t in representations.items()
+                    }
+                if return_contacts:
+                    result["contacts"] = contacts[i, : truncate_len, : truncate_len].clone()
+    esm_emb = result['representations'][36]
    '''
    inputs = tokenizer([protein], return_tensors="pt", padding=True, truncation=True).to('cuda')
    with torch.no_grad():
@@ -32,17 +144,50 @@ def generate_caption(protein, prompt):
    '''
    print("esm embedding generated")
    esm_emb = F.pad(esm_emb.t(), (0, 1024 - len(esm_emb))).t().to('cuda')
-
+    if prompt is None:
+        prompt = 'none'
+    else:
+        prompt = prompt.lower()
    samples = {'name': ['protein_name'],
               'image': torch.unsqueeze(esm_emb, dim=0),
               'text_input': ['none'],
               'prompt': [prompt]}

-
-
-
-
-
+    union_pred_terms = []
+    for model_id in models.keys():
+        model = models[model_id]
+        # Generate the output
+        prediction = model.generate(mistral_model, samples, length_penalty=0., num_beams=15, num_captions=10, temperature=1.,
+                                    repetition_penalty=1.0)
+        x = prediction[0]
+        x = [eval(i) for i in x.split('; ')]
+        pred_terms = []
+        temp = []
+        for i in x:
+            txt = i[0]
+            prob = i[1]
+            sim_list = difflib.get_close_matches(txt.lower(), choices[model_id], n=1, cutoff=0.9)
+            if len(sim_list) > 0:
+                t_standard = sim_list[0]
+                if t_standard not in temp:
+                    pred_terms.append(t_standard+f'({prob})')
+                    temp.append(t_standard)
+        union_pred_terms.append(pred_terms)
+
+    if prompt == 'none':
+        res_str = "No available predictions for this protein, you can use other two types of model, remove prompt or try another sequence!"
+    else:
+        res_str = "No available predictions for this protein, you can use other two types of model or try another sequence!"
+    if len(union_pred_terms[0]) == 0 and len(union_pred_terms[1]) == 0 and len(union_pred_terms[2]) == 0:
+        return res_str
+    res_str = ''
+    if len(union_pred_terms[0]) != 0:
+        res_str += f"Based on the given amino acid sequence, the protein appears to have a primary function of {', '.join(pred_terms)}. "
+    if len(union_pred_terms[1]) != 0:
+        res_str += f"It is likely involved in the {', '.join(pred_terms)}. "
+    if len(union_pred_terms[2]) != 0:
+        res_str += f"It's subcellular localization is within the {', '.join(pred_terms)}."
+    return res_str
    # return "test"


@@ -51,16 +196,50 @@ description = """Quick demonstration of the FAPM model for protein function pred

The model used in this app is available at [Hugging Face Model Hub](https://huggingface.co/wenkai/FAPM) and the source code can be found on [GitHub](https://github.com/xiangwenkai/FAPM/tree/main)."""

-iface = gr.Interface(
-
-
-
-
-)
-
-#
-
-
-
-
+# iface = gr.Interface(
+#     fn=generate_caption,
+#     inputs=[gr.Textbox(type="text", label="Upload sequence"), gr.Textbox(type="text", label="Prompt")],
+#     outputs=gr.Textbox(label="Generated description"),
+#     description=description
+# )
+# # Launch the interface
+# iface.launch()
+
+css = """
+#output {
+    height: 500px;
+    overflow: auto;
+    border: 1px solid #ccc;
+}
+"""
+
+with gr.Blocks(css=css) as demo:
+    gr.Markdown(description)
+    with gr.Tab(label="Protein caption"):
+        with gr.Row():
+            with gr.Column():
+                input_protein = gr.Textbox(type="text", label="Upload sequence")
+                prompt = gr.Textbox(type="text", label="Taxonomy Prompt (Optional)")
+                submit_btn = gr.Button(value="Submit")
+            with gr.Column():
+                output_text = gr.Textbox(label="Output Text")
+        # O14813 train index 127, 266, 738, 1060 test index 4
+        gr.Examples(
+            examples=[
+                ["MDYSYLNSYDSCVAAMEASAYGDFGACSQPGGFQYSPLRPAFPAAGPPCPALGSSNCALGALRDHQPAPYSAVPYKFFPEPSGLHEKRKQRRIRTTFTSAQLKELERVFAETHYPDIYTREELALKIDLTEARVQVWFQNRRAKFRKQERAASAKGAAGAAGAKKGEARCSSEDDDSKESTCSPTPDSTASLPPPPAPGLASPRLSPSPLPVALGSGPGPGPGPQPLKGALWAGVAGGGGGGPGAGAAELLKAWQPAESGPGPFSGVLSSFHRKPGPALKTNLF", ''],
+                ["MKTLALFLVLVCVLGLVQSWEWPWNRKPTKFPIPSPNPRDKWCRLNLGPAWGGRC", ''],
+                ["MAAAGGARLLRAASAVLGGPAGRWLHHAGSRAGSSGLLRNRGPGGSAEASRSLSVSARARSSSEDKITVHFINRDGETLTTKGKVGDSLLDVVVENNLDIDGFGACEGTLACSTCHLIFEDHIYEKLDAITDEENDMLDLAYGLTDRSRLGCQICLTKSMDNMTVRVPETVADARQSIDVGKTS", 'Homo'],
+                ['MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEEGRGNEDRVTLIKDYRGKIETELTKICDGILKLLETHLVPSSTAPESKVFYLKMKGDYYRYLAEFKTGAERKDAAENTMVAYKAAQDIALAELAPTHPIRLGLALNFSVFYYEILNSPDRACSLAKQAFDEAISELDTLSEESYKDSTLIMQLLRDNLTLWTSDISEDPAEEIREAPKRDSSEGQ', 'Zea'],
+                ['MIKAAVTKESLYRMNTLMEAFQGFLGLDLGEFTFKVKPGVFLLTDVKSYLIGDKYDDAFNALIDFVLRNDRDAVEGTETDVSIRLGLSPSDMVVKRQDKTFTFTHGDLEFEVHWINL', 'Bacteriophage'],
+                ['MNDLMIQLLDQFEMGLRERAIKVMATINDEKHRFPMELNKKQCSLMLLGTTDTTTFDMRFNSKKDFPRIKGAREKYPRDAVIEWYHQNWMRTEVKQ', 'Bacteriophage'],
+            ],
+            inputs=[input_protein, prompt],
+            outputs=[output_text],
+            fn=generate_caption,
+            cache_examples=True,
+            label='Try examples'
+        )
+    submit_btn.click(generate_caption, [input_protein, prompt], [output_text])
+
+demo.launch(debug=True)

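The snippet below is a minimal sketch, not part of this commit, of driving the updated entry point outside the Gradio UI. It assumes the file above is importable as `app` (that module name is an assumption), that the checkpoints and GO data files it loads at import time are present, and that a CUDA device is available.

# Illustrative only: call the new generate_caption directly.
# Importing app loads the three FAPM checkpoints, the Mistral decoder and ESM2-3B,
# so this needs the same environment as the Space itself.
from app import generate_caption

seq = "MKTLALFLVLVCVLGLVQSWEWPWNRKPTKFPIPSPNPRDKWCRLNLGPAWGGRC"  # example sequence from the demo
print(generate_caption(seq, None))      # no taxonomy prompt; normalized to 'none' internally
print(generate_caption(seq, "Homo"))    # illustrative taxonomy prompt; lower-cased internally

And a toy sketch of the term-normalization step the new code relies on: each generated phrase is snapped to the closest canonical GO term name with difflib, and only near-exact matches (cutoff 0.9) are kept. The vocabulary below is a toy stand-in for the real `choices[model_id]` dictionaries.

import difflib

# Toy stand-in for choices[model_id]; keys are lower-cased GO term names.
vocab = {'atp binding': 'ATP binding', 'dna binding': 'DNA binding'}
print(difflib.get_close_matches('atp bindin', vocab, n=1, cutoff=0.9))       # ['atp binding']
print(difflib.get_close_matches('protein folding', vocab, n=1, cutoff=0.9))  # [] -> discarded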