Spaces:
Sleeping
Sleeping
Sean MacAvaney
committed on
Commit
·
70f85dc
1
Parent(s):
0160adc
initial commit
Browse files- README.md +19 -6
- app.py +64 -0
- packages.txt +5 -0
- requirements.txt +5 -0
- wrapup.md +3 -0
README.md
CHANGED
@@ -1,12 +1,25 @@
|
|
1 |
---
|
2 |
-
title: Retrieve
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: PyTerrier Retrieve
|
3 |
+
emoji: 🐕
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: green
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.7
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
# 🐕 PyTerrier: Retrieve
|
13 |
+
|
14 |
+
This is a demonstration of [PyTerrier's TerrierRetrieve transformer](https://pyterrier.readthedocs.io/en/latest/terrier-retrieval.html).
|
15 |
+
|
16 |
+
TerrierRetrieve functions as a `Q→R` (retrieval, query-to-result) transformer and can be used in pipelines accordingly. For example, you can
|
17 |
+
pipe the results to a transformer such as `get_text` to load the text associated with the document:
|
18 |
+
|
19 |
+
<div class="pipeline">
|
20 |
+
<div class="df" title="Query Frame">Q</div>
|
21 |
+
<div class="transformer attn" title="TerrierRetrieve Transformer">TerrierRetrieve</div>
|
22 |
+
<div class="df" title="Result Frame">R</div>
|
23 |
+
<div class="transformer" title="get_text Transformer">get_text</div>
|
24 |
+
<div class="df" title="Result Frame">R</div>
|
25 |
+
</div>
|
app.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import gradio as gr
|
3 |
+
import pyterrier as pt
|
4 |
+
pt.init()
|
5 |
+
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_Q
|
6 |
+
|
7 |
+
# Shared retriever used by every request; built once at import time from the
# 'terrier_stemmed' variant of the 'msmarco_passage' dataset.
retr = pt.TerrierRetrieve.from_dataset('msmarco_passage', 'terrier_stemmed')

# Notebook file name and install command handed to code2md alongside the
# generated snippet.
COLAB_NAME = 'pyterrier_retrieve.ipynb'
COLAB_INSTALL = '!pip install -q python-terrier'
|
13 |
+
|
14 |
+
def predict(input, _, wmodel, num_results, pipe_text):
    """Run the retrieval demo and build the matching example code.

    Args:
        input: Query frame passed straight to the pipeline and to ``df2code``.
        _: Unused positional value (presumably the read-only index selector;
            confirm against the caller).
        wmodel: Name of the weighting model, stored in ``retr.controls``.
        num_results: Number of results to retrieve. Coerced to ``int`` so a
            float such as ``5.0`` behaves exactly like ``5``.
        pipe_text: When truthy, a ``get_text`` stage is appended to the
            pipeline so the passage text is loaded for each result.

    Returns:
        A ``(results, markdown)`` tuple: the result frame with ``score``
        rounded to two decimals, and the output of ``code2md`` for the
        generated snippet.
    """
    # A slider configured with a float step may hand over a float; without
    # this, the control below would become "4.0" and the snippet would show
    # ``num_results=5.0``.
    num_results = int(num_results)

    # NOTE(review): this mutates the module-level ``retr``, which is shared by
    # every call to ``predict``.
    retr.controls["wmodel"] = wmodel
    # The control is set to num_results - 1; presumably "end" is a zero-based
    # inclusive rank, so verify against the Terrier documentation.
    retr.controls["end"] = str(num_results - 1)

    code = f'''import pandas as pd
import pyterrier as pt ; pt.init()

retr = pt.TerrierRetrieve.from_dataset('msmarco_passage', 'terrier_stemmed', wmodel={wmodel!r}, num_results={num_results})
'''
    pipeline = retr
    if pipe_text:
        pipeline = pipeline >> pt.text.get_text(pt.get_dataset('irds:msmarco-passage'), 'text')
        code += f'''
pipeline = retr >> pt.text.get_text(pt.get_dataset('irds:msmarco-passage'), 'text')

pipeline({df2code(input)})'''
    else:
        code += f'''
retr({df2code(input)})'''

    res = pipeline(input)
    # Round scores for display only.
    res['score'] = res['score'].map(lambda x: round(x, 2))
    return (res, code2md(code, COLAB_INSTALL, COLAB_NAME))
|
35 |
+
|
36 |
+
# Assemble the page: README text, the interactive demo, then the credits
# from wrapup.md, and launch it.
interface(
    MarkdownFile('README.md'),
    Demo(
        predict,
        # Offer every example query set except 'antique/train'.
        {name: queries for name, queries in EX_Q.items() if name != 'antique/train'},
        # Presumably these component values are passed to predict, after the
        # query input, in this order (index, model, result count, get_text).
        [
            # Single fixed choice; shown for information only.
            gr.Dropdown(
                choices=['msmarco-passage stemmed'],
                value='msmarco-passage stemmed',
                label='Index',
                interactive=False,
            ),
            gr.Dropdown(
                choices=['TF_IDF', 'BM25', 'PL2', 'DPH'],
                value='BM25',
                label='Retrieval Model',
            ),
            gr.Slider(
                minimum=1,
                maximum=10,
                value=5,
                step=1.,
                label='# Results',
            ),
            gr.Checkbox(
                value=True,
                label="Include get_text in pipeline",
            ),
        ],
        scale=2/3,
    ),
    MarkdownFile('wrapup.md'),
).launch(share=True)
|
packages.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
openjdk-11-jdk
|
2 |
+
openjdk-11-jre-headless
|
3 |
+
openjdk-11-jre
|
4 |
+
openjdk-11-jre-headless
|
5 |
+
debianutils
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/seanmacavaney/[email protected]
|
2 |
+
git+https://github.com/terrier-org/pyterrier
|
3 |
+
pyterrier-pisa
|
4 |
+
ir_datasets
|
5 |
+
ir_measures
|
wrapup.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
### References & Credits
|
2 |
+
|
3 |
+
- Craig Macdonald, Nicola Tonellotto, Sean MacAvaney, Iadh Ounis. [PyTerrier: Declarative Experimentation in Python from BM25 to Dense Retrieval](https://dl.acm.org/doi/abs/10.1145/3459637.3482013). CIKM 2021.
|