Spaces:
Runtime error
Runtime error
Sean MacAvaney
commited on
Commit
·
096a82e
1
Parent(s):
d40a755
minusminus
Browse files
app.py
CHANGED
@@ -1,17 +1,27 @@
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import gradio as gr
|
3 |
-
from pyterrier_doc2query import Doc2Query
|
|
|
4 |
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D
|
5 |
|
6 |
MODEL = 'macavaney/doc2query-t5-base-msmarco'
|
|
|
|
|
|
|
7 |
|
8 |
doc2query = Doc2Query(MODEL, append=True, num_samples=5)
|
|
|
|
|
9 |
|
10 |
COLAB_NAME = 'pyterrier_doc2query.ipynb'
|
11 |
COLAB_INSTALL = '''
|
12 |
!pip install -q git+https://github.com/terrier-org/pyterrier
|
13 |
!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
|
14 |
'''.strip()
|
|
|
15 |
|
16 |
def predict(input, model, append, num_samples):
|
17 |
assert model == MODEL
|
@@ -24,7 +34,68 @@ doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
|
|
24 |
|
25 |
doc2query({df2code(input)})
|
26 |
'''
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
interface(
|
30 |
MarkdownFile('README.md'),
|
@@ -48,5 +119,28 @@ interface(
|
|
48 |
label='# Queries'
|
49 |
)],
|
50 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
MarkdownFile('wrapup.md'),
|
52 |
-
).launch(share=
|
|
|
1 |
+
import pyterrier as pt
|
2 |
+
pt.init()
|
3 |
+
import numpy as np
|
4 |
import pandas as pd
|
5 |
import gradio as gr
|
6 |
+
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
|
7 |
+
from pyterrier_dr import ElectraScorer
|
8 |
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D
|
9 |
|
10 |
MODEL = 'macavaney/doc2query-t5-base-msmarco'
|
11 |
+
SCORE_MODEL = 'crystina-z/monoELECTRA_LCE_nneg31'
|
12 |
+
PERCENTILES_BY_5 = np.array([-3.80468750e+00, -2.21679688e+00, -1.25683594e+00, -5.58105469e-01, -7.65323639e-04, 4.69482422e-01, 8.83300781e-01, 1.25878906e+00, 1.61035156e+00, 1.94335938e+00, 2.26562500e+00, 2.58007812e+00, 2.89648438e+00, 3.21484375e+00, 3.54687500e+00, 3.90039062e+00, 4.30078125e+00, 4.77343750e+00, 5.37109375e+00])
|
13 |
+
# Background colours for the percentile badges, one per bucket (20 buckets:
# the 19 thresholds in PERCENTILES_BY_5 split scores into 20 ranges). The list
# is indexed by the bucket number, low scores first. Every channel must stay
# within the valid 0-255 range.
COLORS = [
    'rgb(252, 132, 100)',
    'rgb(252, 148, 116)',
    'rgb(252, 166, 137)',
    'rgb(252, 183, 156)',
    'rgb(253, 200, 178)',
    'rgb(254, 215, 198)',
    'rgb(255, 228, 216)',
    'rgb(255, 237, 228)',
    'rgb(255, 245, 240)',
    'rgb(255, 255, 255)',
    'rgb(247, 252, 245)',
    'rgb(240, 250, 237)',
    'rgb(233, 247, 228)',
    'rgb(222, 242, 216)',
    'rgb(209, 237, 203)',
    'rgb(195, 232, 188)',
    'rgb(180, 225, 173)',
    'rgb(163, 218, 157)',
    'rgb(145, 210, 142)',
    'rgb(125, 201, 126)',
]
|
14 |
|
15 |
doc2query = Doc2Query(MODEL, append=True, num_samples=5)
|
16 |
+
electra = ElectraScorer()
|
17 |
+
query_scorer = QueryScorer(electra)
|
18 |
|
19 |
COLAB_NAME = 'pyterrier_doc2query.ipynb'
|
20 |
COLAB_INSTALL = '''
|
21 |
!pip install -q git+https://github.com/terrier-org/pyterrier
|
22 |
!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
|
23 |
'''.strip()
|
24 |
+
COLAB_INSTALL_MM = COLAB_INSTALL + '\n!pip install -q git+https://github.com/terrierteam/pyterrier_dr faiss-cpu'
|
25 |
|
26 |
def predict(input, model, append, num_samples):
|
27 |
assert model == MODEL
|
|
|
34 |
|
35 |
doc2query({df2code(input)})
|
36 |
'''
|
37 |
+
res = doc2query(input)
|
38 |
+
vis = generate_vis(res)
|
39 |
+
return (doc2query(input), code2md(code, COLAB_INSTALL, COLAB_NAME), vis)
|
40 |
+
|
41 |
+
def generate_vis(df):
    """Render a frame of documents (and any generated queries) as HTML.

    Each row must provide ``docno`` and ``text``. If the row also has
    ``querygen`` (newline-separated queries) the queries are listed below the
    document text. If it additionally has ``querygen_score`` (one score per
    query), each query is prefixed with a coloured badge showing which
    5-percentile bucket of ``PERCENTILES_BY_5`` the score falls into.

    All values taken from the frame are HTML-escaped before being placed in
    the markup, so document text or queries containing ``<``, ``>``, ``&`` or
    quotes are displayed literally instead of being interpreted as markup.

    Returns:
        A single HTML string with one section per row; ``''`` for an empty
        frame.
    """
    from html import escape  # local import: keeps the block self-contained

    result = []
    for row in df.itertuples(index=False):
        qs = []
        if hasattr(row, 'querygen_score'):
            for q, score in zip(row.querygen.split('\n'), row.querygen_score):
                # Index of the first threshold >= score, i.e. a bucket in 0..19.
                bucket = int(np.searchsorted(PERCENTILES_BY_5, score))
                color = COLORS[bucket]
                percentile = bucket * 5
                qs.append(f'''
<div>
<span title="score={score:.4f}, in the {percentile}th percentile of scores" style="border: 1px solid #888; border-radius: 3px; font-size: 0.6em; font-family: monospace; background-color: {color}; padding: 1px 3px;">{percentile}th</span> {escape(q)}
</div>
''')
        elif hasattr(row, 'querygen'):
            for q in row.querygen.split('\n'):
                qs.append(f'''
<div>{escape(q)}</div>
''')
        qs = '\n'.join(qs)
        if qs:
            qs = f'''
<div><strong>Expansion Queries:</strong></div>
{qs}
'''
        # Escape first, then turn newlines into line breaks so the <br/> tags
        # we add ourselves are not escaped.
        text = escape(row.text).replace('\n', '<br/>')
        docno = escape(str(row.docno))
        result.append(f'''
<div style="font-size: 1.2em;">Document: <strong>{docno}</strong></div>
<div style="margin: 4px 0 16px; padding: 4px; border: 1px solid black;">
<div>
{text}
</div>
{qs}
</div>
''')
    return '\n'.join(result)
|
77 |
+
|
78 |
+
def predict_mm(input, model, num_samples, score_model):
    """Run the Doc2Query-- demo: generate queries, then score them.

    Args:
        input: frame of documents handed to the pipeline.
        model: generator model name; must equal ``MODEL``.
        num_samples: number of queries to generate per document.
        score_model: scorer model name; must equal ``SCORE_MODEL``.

    Returns:
        A 3-tuple of (scored result frame with ``querygen_score`` rendered as
        a string, markdown for the equivalent code snippet, HTML
        visualisation of the scored queries).
    """
    assert model == MODEL
    assert score_model == SCORE_MODEL

    # Reconfigure the shared module-level generator for this request.
    doc2query.append = False
    doc2query.num_samples = num_samples
    scoring_pipeline = doc2query >> query_scorer

    # Snippet shown to the user that reproduces this run.
    snippet = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer
from pyterrier_dr import ElectraScorer

doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer)

pipeline({df2code(input)})
'''

    scored = scoring_pipeline(input)
    # Build the visualisation before the scores are turned into strings below.
    html_view = generate_vis(scored)

    def _format_scores(scores):
        return f"[ {', '.join(str(v) for v in scores)} ]"

    scored['querygen_score'] = scored['querygen_score'].apply(_format_scores)
    return (scored, code2md(snippet, COLAB_INSTALL_MM, COLAB_NAME), html_view)
|
99 |
|
100 |
interface(
|
101 |
MarkdownFile('README.md'),
|
|
|
119 |
label='# Queries'
|
120 |
)],
|
121 |
),
|
122 |
+
MarkdownFile('mm.md'),
|
123 |
+
Demo(
|
124 |
+
predict_mm,
|
125 |
+
EX_D,
|
126 |
+
[
|
127 |
+
gr.Dropdown(
|
128 |
+
choices=[MODEL],
|
129 |
+
value=MODEL,
|
130 |
+
label='Model',
|
131 |
+
interactive=False,
|
132 |
+
), gr.Slider(
|
133 |
+
minimum=1,
|
134 |
+
maximum=10,
|
135 |
+
value=doc2query.num_samples,
|
136 |
+
step=1.,
|
137 |
+
label='# Queries'
|
138 |
+
), gr.Dropdown(
|
139 |
+
choices=[SCORE_MODEL],
|
140 |
+
value=SCORE_MODEL,
|
141 |
+
label='Filter',
|
142 |
+
interactive=False,
|
143 |
+
)],
|
144 |
+
),
|
145 |
MarkdownFile('wrapup.md'),
|
146 |
+
).launch(share=True)
|
mm.md
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Doc2Query−−: When Less is More
|
2 |
+
|
3 |
+
You might notice that not all the generated queries look related to the source text. This is due
|
4 |
+
to a defect that can appear in sequence-to-sequence models known as "[hallucination](https://aclanthology.org/2020.acl-main.173/)".
|
5 |
+
|
6 |
+
Doc2Query−− can filter out these low-quality queries by measuring the relevance between them and the text that
|
7 |
+
generated them using a scoring transformer `S`. It is applied as two transformers that follow the Doc2Query generator:
|
8 |
+
|
9 |
+
<div class="pipeline">
|
10 |
+
<div class="df" title="Document Frame">D</div>
|
11 |
+
<div class="transformer" title="Doc2Query Transformer">Doc2Query</div>
|
12 |
+
<div class="df" title="Document Frame">D</div>
|
13 |
+
<div class="transformer attn" title="QueryScorer Transformer">QueryScorer
|
14 |
+
<div class="artefact" title="Scorer Transformer">S</div>
|
15 |
+
</div>
|
16 |
+
<div class="df" title="Document Frame">D</div>
|
17 |
+
<div class="transformer attn" title="QueryFilter Transformer">QueryFilter</div>
|
18 |
+
<div class="df" title="Document Frame">D</div>
|
19 |
+
</div>
|
requirements.txt
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
git+https://github.com/seanmacavaney/[email protected]
|
2 |
git+https://github.com/terrier-org/pyterrier
|
3 |
-
git+https://github.com/terrierteam/pyterrier_doc2query@
|
|
|
4 |
ir_datasets
|
5 |
ir_measures
|
|
|
|
1 |
git+https://github.com/seanmacavaney/[email protected]
|
2 |
git+https://github.com/terrier-org/pyterrier
|
3 |
+
git+https://github.com/terrierteam/pyterrier_doc2query@minusminus
|
4 |
+
git+https://github.com/terrierteam/pyterrier_dr
|
5 |
ir_datasets
|
6 |
ir_measures
|
7 |
+
faiss-cpu
|
wrapup.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
### Putting it all together
|
2 |
|
3 |
-
You can use Doc2Query in an indexing pipeline to build an index of the expanded documents:
|
4 |
|
5 |
<div class="pipeline">
|
6 |
<div class="df" title="Document Frame">D</div>
|
7 |
-
<div class="transformer attn" title="Doc2Query Transformer">Doc2Query</div>
|
8 |
<div class="df" title="Document Frame">D</div>
|
9 |
<div class="transformer" title="Indexer">Indexer</div>
|
10 |
<div class="artefact" title="Doc2Query Index">IDX</div>
|
@@ -39,4 +39,5 @@ bm25 = pt.BatchRetrieve('./msmarco_psg', wmodel="BM25")
|
|
39 |
### References & Credits
|
40 |
|
41 |
- Rodrigo Nogueira and Jimmy Lin. [From doc2query to docTTTTTquery](https://cs.uwaterloo.ca/~jimmylin/publications/Nogueira_Lin_2019_docTTTTTquery-v2.pdf).
|
|
|
42 |
- Craig Macdonald, Nicola Tonellotto, Sean MacAvaney, Iadh Ounis. [PyTerrier: Declarative Experimentation in Python from BM25 to Dense Retrieval](https://dl.acm.org/doi/abs/10.1145/3459637.3482013). CIKM 2021.
|
|
|
1 |
### Putting it all together
|
2 |
|
3 |
+
You can use Doc2Query or Doc2Query−− in an indexing pipeline to build an index of the expanded documents:
|
4 |
|
5 |
<div class="pipeline">
|
6 |
<div class="df" title="Document Frame">D</div>
|
7 |
+
<div class="transformer attn" title="Doc2Query or Doc2Query−− Transformer">Doc2Query[−−]</div>
|
8 |
<div class="df" title="Document Frame">D</div>
|
9 |
<div class="transformer" title="Indexer">Indexer</div>
|
10 |
<div class="artefact" title="Doc2Query Index">IDX</div>
|
|
|
39 |
### References & Credits
|
40 |
|
41 |
- Rodrigo Nogueira and Jimmy Lin. [From doc2query to docTTTTTquery](https://cs.uwaterloo.ca/~jimmylin/publications/Nogueira_Lin_2019_docTTTTTquery-v2.pdf).
|
42 |
+
- Mitko Gospodinov, Sean MacAvaney, and Craig Macdonald. Doc2Query--: When Less is More. ECIR 2023.
|
43 |
- Craig Macdonald, Nicola Tonellotto, Sean MacAvaney, Iadh Ounis. [PyTerrier: Declarative Experimentation in Python from BM25 to Dense Retrieval](https://dl.acm.org/doi/abs/10.1145/3459637.3482013). CIKM 2021.
|