jacklin64 committed on
Commit
cd46a21
·
1 Parent(s): a30fe33
README.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Aggretriever is an encoder that aggregates both lexical and semantic text information into a single dense vector for dense retrieval. It is fine-tuned on the MS MARCO corpus with BM25 negative sampling, following the approach described in [Aggretriever: A Simple Approach to Aggregate Textual Representation for Robust Dense Passage Retrieval](https://arxiv.org/abs/2208.00511).
2
+
3
+ <p align="center">
4
+ <img src="https://raw.githubusercontent.com/castorini/dhr/main/fig/aggretriever_teaser.png" width="600">
5
+ </p>
6
+
7
+ The associated GitHub repository for fine-tuning is available [here](https://github.com/castorini/dhr), and the results can be reproduced with [Pyserini](https://github.com/castorini/pyserini). The following variants are also available:
8
+
9
+ Model | Initialization | MARCO Dev | Encoder Path
10
+ |---|---|---|---|
11
+ aggretriever-distilbert | distilbert-base-uncased | 34.1 | [castorini/aggretriever-distilbert](https://huggingface.co/castorini/aggretriever-distilbert)
12
+ aggretriever-cocondenser | Luyu/co-condenser-marco | 36.2 | [castorini/aggretriever-cocondenser](https://huggingface.co/castorini/aggretriever-cocondenser)
13
+
14
+ ## Usage (HuggingFace Transformers)
15
+ The model is available directly on the HuggingFace Hub. To encode queries and passages, we use the Aggretriever implementation from Pyserini [here](https://github.com/castorini/pyserini/blob/master/pyserini/encode/_aggretriever.py).
16
+
17
+ ```python
18
+ from pyserini.encode._aggretriever import AggretrieverQueryEncoder
19
+ from pyserini.encode._aggretriever import AggretrieverDocumentEncoder
20
+
21
+ model_name = 'castorini/aggretriever-cocondenser'
22
+ query_encoder = AggretrieverQueryEncoder(model_name, device='cpu')
23
+ context_encoder = AggretrieverDocumentEncoder(model_name, device='cpu')
24
+
25
+ query = ["Where was Marie Curie born?"]
26
+ contexts = [
27
+ "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
28
+ "Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
29
+ ]
30
+
31
+ # Compute embeddings: each encoder aggregates lexical and semantic information into a single dense vector
32
+ query_emb = query_encoder.encode(query)
33
+ ctx_emb = context_encoder.encode(contexts)
34
+ # Compute similarity scores using dot product
35
+ score1 = query_emb @ ctx_emb[0] # 45.56658
36
+ score2 = query_emb @ ctx_emb[1] # 45.81762
37
+ ```
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/store2/scratch/s269lin/Aggretriever/results/experiments/msmarco/coCondenser-Aggretriever",
3
+ "architectures": [
4
+ "AggretrieverEncoder"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "label2id": {
18
+ "LABEL_0": 0
19
+ },
20
+ "layer_norm_eps": 1e-12,
21
+ "max_position_embeddings": 512,
22
+ "model_type": "bert",
23
+ "num_attention_heads": 12,
24
+ "num_hidden_layers": 12,
25
+ "output_hidden_states": true,
26
+ "pad_token_id": 0,
27
+ "position_embedding_type": "absolute",
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.15.0",
30
+ "type_vocab_size": 2,
31
+ "use_cache": true,
32
+ "vocab_size": 30522
33
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99ed0d5d6a10d669ae95ccf41420dc6f91ef45b7954255c31db1324a5d40a745
3
+ size 438540579
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "name_or_path": "Luyu/co-condenser-marco", "special_tokens_map_file": "/bos/tmp0/luyug/outputs/condenser/models/l2-s6-km-L128-e8-lr1e-4-b256/special_tokens_map.json", "tokenizer_file": null, "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff