atrytone committed on
Commit 019f2ed · 1 Parent(s): 9ff7fca

Upload Build_VecStore.ipynb

Files changed (1)
Build_VecStore.ipynb +282 -0
Build_VecStore.ipynb ADDED
@@ -0,0 +1,282 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QS0v2bceN4Or"
+ },
+ "source": [
+ "Builds a database of vector embeddings from a list of abstracts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "l5RwcIG8OAjX"
+ },
+ "source": [
+ "## Some Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "sfwT5YW2JCnu"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install transformers==4.28.0\n",
+ "!pip install -U sentence-transformers\n",
+ "!pip install datasets\n",
+ "!pip install langchain\n",
+ "!pip install torch\n",
+ "!pip install faiss-cpu"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "psoTvOp4VkBE"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import shutil\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from tqdm.auto import tqdm\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "arZiN8QRHS_a"
+ },
+ "outputs": [],
+ "source": [
+ "import locale\n",
+ "locale.getpreferredencoding = lambda: \"UTF-8\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BR-adEUUz9su"
+ },
+ "outputs": [],
+ "source": [
+ "def create_lbert_embed(sents, bundle):\n",
+ "    tokenizer = bundle[0]\n",
+ "    model = bundle[1]\n",
+ "    model.cuda()\n",
+ "    tokens = tokenizer(sents, padding=True, truncation=True, return_tensors='pt')\n",
+ "    device = torch.device('cuda')\n",
+ "    tokens = tokens.to(device)\n",
+ "    with torch.no_grad():\n",
+ "        embeds = model.bert(**tokens).pooler_output\n",
+ "    return embeds.cpu()\n",
+ "\n",
+ "def create_miread_embed(sents, bundle):\n",
+ "    tokenizer = bundle[0]\n",
+ "    model = bundle[1]\n",
+ "    model.cuda()\n",
+ "    tokens = tokenizer(sents,\n",
+ "                       max_length=512,\n",
+ "                       padding=True,\n",
+ "                       truncation=True,\n",
+ "                       return_tensors=\"pt\"\n",
+ "                       )\n",
+ "    device = torch.device('cuda')\n",
+ "    tokens = tokens.to(device)\n",
+ "    with torch.no_grad():\n",
+ "        out = model.bert(**tokens)\n",
+ "    feature = out.last_hidden_state[:, 0, :]\n",
+ "    return feature.cpu()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "-wHpHmD3zNSR"
+ },
+ "outputs": [],
+ "source": [
+ "from langchain.vectorstores import FAISS\n",
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
+ "\n",
+ "model_name = \"biodatlab/MIReAD-Neuro\"\n",
+ "model_kwargs = {'device': 'cuda'}\n",
+ "encode_kwargs = {'normalize_embeddings': False}\n",
+ "faiss_embedder = HuggingFaceEmbeddings(\n",
+ "    model_name=model_name,\n",
+ "    model_kwargs=model_kwargs,\n",
+ "    encode_kwargs=encode_kwargs\n",
+ ")\n",
+ "\n",
+ "def add_to_db(data, create_embed, bundle, name=''):\n",
+ "    \"\"\"\n",
+ "    data : list of rows with an 'abstract' field plus optional metadata\n",
+ "    create_embed : function that creates the embedding given an abstract\n",
+ "    bundle : (tokenizer, model) pair passed through to create_embed\n",
+ "    \"\"\"\n",
+ "    batch_size = 128\n",
+ "    res = []\n",
+ "    vecdb = None\n",
+ "    for i in tqdm(range(0, len(data), batch_size)):\n",
+ "        # find end of batch\n",
+ "        i_end = min(i + batch_size, len(data))\n",
+ "        # create IDs batch\n",
+ "        ids = [name + '-' + str(x) for x in range(i, i_end)]\n",
+ "        # create metadata batch\n",
+ "        metadatas = [{\n",
+ "            'journal': row.get('journal', 'None'),\n",
+ "            'title': row['title'],\n",
+ "            'abstract': row['abstract'],\n",
+ "            'authors': row.get('authors', 'None'),\n",
+ "            'link': row.get('link', 'None'),\n",
+ "            'date': row.get('date', 'None'),\n",
+ "            'submitter': row.get('submitter', 'None'),\n",
+ "        } for row in data[i:i_end]]\n",
+ "        # create embeddings\n",
+ "        em = [create_embed(row['abstract'], bundle).tolist()[0] for row in data[i:i_end]]\n",
+ "        texts = [row['abstract'] for row in data[i:i_end]]\n",
+ "        records = list(zip(texts, em))\n",
+ "        if vecdb:\n",
+ "            vecdb_batch = FAISS.from_embeddings(records, faiss_embedder, metadatas=metadatas, ids=ids)\n",
+ "            vecdb.merge_from(vecdb_batch)\n",
+ "        else:\n",
+ "            vecdb = FAISS.from_embeddings(records, faiss_embedder, metadatas=metadatas, ids=ids)\n",
+ "    return vecdb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PfsK3DE4MMou"
+ },
+ "outputs": [],
+ "source": [
+ "nbdt_data = pd.read_json('data_final.json')\n",
+ "aliases = pd.read_csv('id_list.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "JrGJh5XgNPvU"
+ },
+ "outputs": [],
+ "source": [
+ "aliases = aliases.drop_duplicates('Full Name')\n",
+ "aliases.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "CShYwGwWMZh5"
+ },
+ "outputs": [],
+ "source": [
+ "nbdt_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "SziJtbggMuyn"
+ },
+ "outputs": [],
+ "source": [
+ "def load_nbdt(data, aliases):\n",
+ "    nbdt_records = []\n",
+ "    urls = []\n",
+ "    no_abst_count = 0\n",
+ "    no_journal_count = 0\n",
+ "    for row in aliases.itertuples():\n",
+ "        name = row[1]\n",
+ "        auth_ids = eval(row[2])\n",
+ "        auth_ids = [int(x) for x in auth_ids]\n",
+ "        papers = data.loc[data['authorId'].isin(auth_ids)]['papers']\n",
+ "        all_papers = []\n",
+ "        for paper_set in papers:\n",
+ "            all_papers.extend(paper_set)\n",
+ "        for paper in all_papers:\n",
+ "            url = paper['url']\n",
+ "            title = paper['title']\n",
+ "            abst = paper['abstract']\n",
+ "            year = paper['year']\n",
+ "            journal = paper.get('journal')\n",
+ "            if journal:\n",
+ "                journal = journal.get('name')\n",
+ "            else:\n",
+ "                journal = 'None'\n",
+ "                no_journal_count += 1\n",
+ "            authors = [name]\n",
+ "            if not abst:\n",
+ "                abst = ''\n",
+ "                no_abst_count += 1\n",
+ "            record = {'journal': journal, 'title': title, 'abstract': abst, 'link': url, 'date': year, 'authors': authors, 'submitter': 'None'}\n",
+ "            if url not in urls:\n",
+ "                nbdt_records.append(record)\n",
+ "                urls.append(url)\n",
+ "    return nbdt_records, (no_abst_count, no_journal_count)\n",
+ "nbdt_recs, no_counts = load_nbdt(nbdt_data, aliases)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "IovTlDINc2Ds"
+ },
+ "outputs": [],
+ "source": [
+ "nbdt_db = add_to_db(nbdt_recs, create_miread_embed, miread_bundle, 'nbdt')\n",
+ "nbdt_db.save_local(\"nbdt_index\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
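
The index this notebook writes with save_local("nbdt_index") can later be reloaded and queried through the same LangChain wrapper. A minimal sketch, assuming the faiss_embedder configuration from the notebook; the query string and the printed fields are illustrative, not part of the commit:

    # Reload the persisted FAISS store and run a similarity query against it.
    from langchain.vectorstores import FAISS
    from langchain.embeddings import HuggingFaceEmbeddings

    # Same query-time embedder the notebook configures for the store.
    faiss_embedder = HuggingFaceEmbeddings(
        model_name="biodatlab/MIReAD-Neuro",
        model_kwargs={"device": "cuda"},
        encode_kwargs={"normalize_embeddings": False},
    )

    # Load the folder written by nbdt_db.save_local("nbdt_index").
    vecdb = FAISS.load_local("nbdt_index", faiss_embedder)

    # Illustrative query; each hit carries the metadata stored by add_to_db
    # (journal, title, abstract, authors, link, date, submitter).
    hits = vecdb.similarity_search("hippocampal replay during sleep", k=5)
    for doc in hits:
        print(doc.metadata["title"], doc.metadata["link"])

Note that queries made this way are embedded by HuggingFaceEmbeddings rather than by create_miread_embed, so retrieval quality depends on the two producing comparable vectors.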