wenkai commited on
Commit
309ab27
·
verified ·
1 Parent(s): 4b532c0

Upload 6 files

Browse files
output/cal_f1.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def cal_f1(df, standard=False):
    """Print and return macro-averaged recall, precision and F1 for term predictions.

    Args:
        df: DataFrame with a ``label`` column (ground-truth terms separated by
            ``;``) and a ``pred`` column. The frame is only read; no helper
            columns are written back to it.
        standard: Selects how ``pred`` is parsed. When True, each cell is the
            text of one Python list of sequences whose first element is the
            predicted term. When False, each cell is a ``;``-separated series
            of such sequences.

    Returns:
        Tuple ``(recall, precision, f1)``. Recall is averaged over the distinct
        ground-truth terms. Precision sums the per-term precision of the
        ground-truth terms and divides by the number of distinct predicted
        terms, so terms that are predicted but never appear as a label count
        as precision 0. Both averages are 0.0 when their denominator is empty.
    """
    # Ground-truth terms are normalised (trimmed and lower-cased).
    label_lists = [
        [term.strip().lower() for term in raw.split(';')]
        for raw in df['label']
    ]

    # SECURITY NOTE: eval() executes whatever text is stored in the `pred`
    # column. Only use this on result files you produced yourself; if the
    # cells are always plain literals, ast.literal_eval would be a safer
    # parser -- TODO confirm the file format before switching.
    if standard:
        pred_lists = [
            [item[0] for item in eval(str(raw))]
            for raw in df['pred']
        ]
    else:
        pred_lists = [
            [eval(chunk.strip())[0] for chunk in str(raw).split(';')]
            for raw in df['pred']
        ]
    # NOTE(review): labels are lower-cased above but predictions are compared
    # as-is -- confirm predicted terms are already lower-case.

    # Sorted so the floating-point sums below are accumulated in a
    # deterministic order.
    labels = sorted({term for terms in label_lists for term in terms})
    total = len(labels)

    tp_dict = dict.fromkeys(labels, 0)
    fp_dict = dict.fromkeys(labels, 0)
    fn_dict = dict.fromkeys(labels, 0)
    predicted_terms = set()

    for preds, truth in zip(pred_lists, label_lists):
        pred_set = set(preds)
        truth_set = set(truth)
        for term in truth:
            if term in pred_set:
                tp_dict[term] += 1
            else:
                fn_dict[term] += 1
        for term in preds:
            if term not in truth_set:
                # Predicted-only terms get an entry here as well, which is
                # what the "preds not in labels" count below relies on.
                fp_dict[term] = fp_dict.get(term, 0) + 1
        predicted_terms.update(preds)

    p_total = len(predicted_terms)

    # The 1e-8 keeps the per-term ratios defined when a term has no hits.
    recall_sum = sum(
        tp_dict[x] / (tp_dict[x] + fn_dict[x] + 1e-8) for x in labels
    )
    precision_sum = sum(
        tp_dict[x] / (tp_dict[x] + fp_dict[x] + 1e-8) for x in labels
    )

    # Guard the averages: an empty frame has neither labels nor predictions.
    r = recall_sum / total if total else 0.0
    p = precision_sum / p_total if p_total else 0.0
    f1 = 2 * p * r / (p + r + 1e-8)

    print("preds not in labels: {}".format(len(fp_dict) - total))
    print("recall:{}; percision:{}; f1 score: {}".format(r, p, f1))
    return r, p, f1
57
+
58
+
59
# Result files to score. Files whose name contains 'standard' are parsed
# with cal_f1(..., standard=True); all others use the default parser.
names = [
    'output_test_mf_exp_493552.txt',
    'output_test_mf_exp_445772_pre.txt',
    'output_test_mf_exp_445772.txt',
    'output_test_mf_exp_486524.txt',
    'output_test_mf_493552_standard.csv',
    'output_test_mf_445772_standard.csv',
    'output_test_mf_exp_445772_withprompt.txt',
    'output_test_mf_exp_506753.txt',
]
# Alternative file set (swap in to score these instead):
#names = ['output_test_bp_exp_451674.txt', 'output_test_bp_exp_493547_pre.txt', 'output_test_bp_exp_496359_withprompt.txt']

for name in names:
    print(name)
    # Files are '|'-separated and read without a header row.
    df = pd.read_csv('/cluster/home/wenkai/LAVIS/output/mf_bp_cc/{}'.format(name), sep='|', header=None)
    # Some files carry a header line as their first record; drop it.
    if df.iloc[0, 0] == 'name':
        df = df[1:]
    df.columns = ['name', 'pred', 'label']
    cal_f1(df, standard='standard' in name)
73
+
74
+
75
+
output/output_test_mf_445772_standard.csv ADDED
The diff for this file is too large to render. See raw diff
 
output/output_val_mf_445772_standard.csv ADDED
The diff for this file is too large to render. See raw diff
 
projects/blip2/README.md ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models
2
+ This is the official implementation of BLIP-2 [paper](https://arxiv.org/abs/2301.12597), a generic and efficient pre-training strategy that easily harvests development of pretrained vision models and large language models (LLMs) for vision-language pretraining. BLIP-2 beats Flamingo on zero-shot VQAv2 (**65.0** vs **56.3**), establishing new state-of-the-art on zero-shot captioning (on NoCaps **121.6** CIDEr score vs previous best **113.2**). Equipped with powerful LLMs (e.g. OPT, FlanT5), BLIP-2 also unlocks the new **zero-shot instructed vision-to-language generation** capabilities for various interesting applications!
3
+
4
+ <img src="blip2_illustration.png" width="500">
5
+
6
+ ### Install:
7
+ ```
8
+ pip install salesforce-lavis
9
+ ```
10
+ or install from source following LAVIS instruction.
11
+
12
+ ### Demo:
13
+ Try out our [Notebook Demo](https://github.com/salesforce/LAVIS/blob/main/examples/blip2_instructed_generation.ipynb) on instructed vision-to-language generation: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/salesforce/LAVIS/blob/main/examples/blip2_instructed_generation.ipynb)
14
+
15
+
16
+ ### BLIP-2 Model Zoo
17
+ ```python
18
+ # ==================================================
19
+ # Architectures Types
20
+ # ==================================================
21
+ # blip2_opt pretrain_opt2.7b, caption_coco_opt2.7b, pretrain_opt6.7b, caption_coco_opt6.7b
22
+ # blip2_t5 pretrain_flant5xl, caption_coco_flant5xl, pretrain_flant5xxl
23
+ # blip2 pretrain, coco
24
+ ```
25
+ - Use ```pretrain_{LLM}``` model types for zero-shot image-to-text generation with prompts.
26
+ - Use ```caption_coco_{LLM}``` model types to generate coco-style captions.
27
+ - Use ```blip2``` model architecture for image-text feature extraction and retrieval.
28
+
29
+ ### Image-to-text Generation Example
30
+ Let’s see how to use BLIP-2 models to perform zero-shot instructed image-to-text generation. We first load a sample image from local disk.
31
+ ```python
32
+ import torch
33
+ from PIL import Image
34
+ # setup device to use
35
+ device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
36
+ # load sample image
37
+ raw_image = Image.open("../../docs/_static/merlion.png").convert("RGB")
38
+ display(raw_image.resize((596, 437)))
39
+ ```
40
+
41
+ Then we load a pre-trained BLIP-2 model with its preprocessors (transforms).
42
+ ```python
43
+ import torch
44
+ from lavis.models import load_model_and_preprocess
45
+ # loads BLIP-2 pre-trained model
46
+ model, vis_processors, _ = load_model_and_preprocess(name="blip2_t5", model_type="pretrain_flant5xxl", is_eval=True, device=device)
47
+ # prepare the image
48
+ image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
49
+ ```
50
+
51
+ Given the image and a text prompt, ask the model to generate the response.
52
+ ```python
53
+ model.generate({"image": image, "prompt": "Question: which city is this? Answer:"})
54
+ # 'singapore'
55
+ ```
56
+
57
+ Ask the model to explain its answer.
58
+ ```python
59
+ model.generate({
60
+ "image": image,
61
+ "prompt": "Question: which city is this? Answer: singapore. Question: why?"})
62
+ # 'it has a statue of a merlion'
63
+ ```
64
+
65
+
66
+
67
+
68
+ Ask a follow-up question.
69
+ ```python
70
+ # prepare context prompt
71
+ context = [
72
+ ("which city is this?", "singapore"),
73
+ ("why?", "it has a statue of a merlion"),
74
+ ]
75
+ question = "where is the name merlion coming from?"
76
+ template = "Question: {} Answer: {}."
77
+ prompt = " ".join([template.format(context[i][0], context[i][1]) for i in range(len(context))]) + " Question: " + question + " Answer:"
78
+ print(prompt)
79
+ # generate model's response
80
+ model.generate({"image": image,"prompt": prompt})
81
+ # 'merlion is a portmanteau of mermaid and lion'
82
+ ```
83
+
84
+ ### Feature Extraction Example
85
+ BLIP-2 supports the Unified Feature Extraction Interface of LAVIS. Check out this [notebook](https://github.com/salesforce/LAVIS/blob/3446bac20c5646d35ae383ebe6d13cec4f8b00cb/examples/blip2_feature_extraction.ipynb) for an example.
86
+
87
+ ### Image-Text Matching Example
88
+ BLIP-2 can compute the image-text matching score using the same interface as BLIP. Check out this [notebook](https://github.com/salesforce/LAVIS/blob/3446bac20c5646d35ae383ebe6d13cec4f8b00cb/examples/blip2_image_text_matching.ipynb) for an example.
89
+
90
+ ### Benchmark Evaluation
91
+ Follow [Dataset Download](https://opensource.salesforce.com/LAVIS//latest/getting_started.html#auto-downloading-and-loading-datasets) to prepare common vision-language datasets.
92
+
93
+ Run [these scripts](https://github.com/salesforce/LAVIS/tree/main/run_scripts/blip2/eval) for evaluating pretrained and finetuned models.
94
+
95
+ ### Training
96
+ Stage-1 Pre-training (from scratch):
97
+ ```bash run_scripts/blip2/train/pretrain_stage1.sh```
98
+
99
+ Stage-2 Pre-training:
100
+ ```bash run_scripts/blip2/train/pretrain_stage2.sh```
101
+
102
+ Finetune for image captioning:
103
+ ```bash run_scripts/blip2/train/train_caption_coco.sh```
104
+
105
+ The [config files](https://github.com/salesforce/LAVIS/tree/main/lavis/projects/blip2/train) can be modified for customized training.
106
+
107
+ ### Citing BLIP-2
108
+ <pre>
109
+ @inproceedings{li2023blip2,
110
+ title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
111
+ author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
112
+ year={2023},
113
+ booktitle={ICML},
114
+ }</pre>
115
+
116
+ ### 🤗 Hugging Face integration
117
+
118
+ BLIP-2 is integrated into the Hugging Face 🤗 [Transformers](https://github.com/huggingface/transformers) library, and allows you to leverage int8 quantization thanks to [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). This roughly halves the amount of memory required to load the model, without performance degradation.
119
+
120
+ Documentation can be found [here](https://huggingface.co/docs/transformers/main/model_doc/blip-2).
121
+
122
+ Usage in half precision (float16) is as follows:
123
+
124
+ ```
125
+ from PIL import Image
126
+ import requests
127
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
128
+ import torch
129
+
130
+ device = "cuda" if torch.cuda.is_available() else "cpu"
131
+
132
+ processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
133
+ model = Blip2ForConditionalGeneration.from_pretrained(
134
+ "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
135
+ )
136
+ model.to(device)
137
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
138
+ image = Image.open(requests.get(url, stream=True).raw)
139
+
140
+ inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
141
+
142
+ generated_ids = model.generate(**inputs)
143
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
144
+ print(generated_text)
145
+ ```
146
+
147
+ To leverage the int8 algorithm, you can run the model as follows:
148
+
149
+ ```
150
+ import torch
151
+ import requests
152
+ from PIL import Image
153
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
154
+
155
+ processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
156
+ model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map="auto")
157
+
158
+ img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
159
+ raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
160
+
161
+ question = "how many dogs are in the picture?"
162
+ inputs = processor(raw_image, question, return_tensors="pt").to("cuda", torch.float16)
163
+
164
+ out = model.generate(**inputs)
165
+ print(processor.decode(out[0], skip_special_tokens=True))
166
+ ```
167
+
168
+ All models can be found on the [hub](https://huggingface.co/models?other=blip-2).
projects/blip2/blip2_illustration.png ADDED
projects/blip2/model_card.pdf ADDED
Binary file (125 kB). View file