chenghao committed on
Commit
1b978d3
·
1 Parent(s): c72b49f

:tada: initial commit

Browse files
Files changed (1) hide show
  1. app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import plotly.graph_objects as go
3
+ import numpy as np
4
+ import scipy.integrate as integrate
5
+
6
def _false_positive_probability(threshold, b, r):
    """Return the area under the LSH S-curve to the left of ``threshold``.

    The integrand ``1 - (1 - s**r)**b`` is the probability that two items
    with Jaccard similarity ``s`` share at least one identical band when the
    signature is split into ``b`` bands of ``r`` rows. Integrating it over
    ``[0, threshold]`` measures how much candidate-pair probability falls on
    similarities below the threshold.

    Args:
        threshold: Upper bound of the integration interval.
        b: Number of bands.
        r: Number of rows per band.

    Returns:
        The integral estimate from ``scipy.integrate.quad``; the accompanying
        error estimate is discarded.
    """

    def _probability(s):
        # Exponents are coerced to float before the power is taken.
        return 1 - (1 - s ** float(r)) ** float(b)

    area, _ = integrate.quad(_probability, 0.0, threshold)
    return area
11
+
12
+
13
def _false_negative_probability(threshold, b, r):
    """Return the area above the LSH S-curve to the right of ``threshold``.

    The integrand ``1 - (1 - (1 - s**r)**b)`` is the complement of the
    candidate-pair probability, i.e. the probability that two items with
    Jaccard similarity ``s`` share no identical band out of ``b`` bands of
    ``r`` rows. Integrating it over ``[threshold, 1]`` measures how much
    miss probability falls on similarities above the threshold.

    Args:
        threshold: Lower bound of the integration interval.
        b: Number of bands.
        r: Number of rows per band.

    Returns:
        The integral estimate from ``scipy.integrate.quad``; the accompanying
        error estimate is discarded.
    """

    def _probability(s):
        # Exponents are coerced to float before the power is taken.
        return 1 - (1 - (1 - s ** float(r)) ** float(b))

    area, _ = integrate.quad(_probability, threshold, 1.0)
    return area
19
+
20
+
21
def _optimal_param(threshold, num_perm, false_positive_weight, false_negative_weight):
    """Find the `MinHashLSH` band/row split with the lowest weighted error.

    Every pair ``(b, r)`` with ``b * r <= num_perm`` is scored by the
    weighted sum of the false-positive and false-negative areas at
    ``threshold``; the pair with the smallest score wins. Ties keep the
    pair encountered first (smallest ``b``, then smallest ``r``).

    Args:
        threshold: Jaccard similarity threshold the areas are measured at.
        num_perm: Number of permutations available to split into bands.
        false_positive_weight: Multiplier applied to the false-positive area.
        false_negative_weight: Multiplier applied to the false-negative area.

    Returns:
        A ``(b, r)`` tuple of bands and rows per band, or ``(0, 0)`` when
        ``num_perm`` is less than 1 and there is nothing to search.
    """
    min_error = float("inf")
    opt = (0, 0)
    for b in range(1, num_perm + 1):
        # Bands times rows may not exceed the permutation budget.
        max_r = num_perm // b
        for r in range(1, max_r + 1):
            fp = _false_positive_probability(threshold, b, r)
            fn = _false_negative_probability(threshold, b, r)
            error = fp * false_positive_weight + fn * false_negative_weight
            if error < min_error:
                min_error = error
                opt = (b, r)
    return opt
38
+
39
+
40
# Two-column control panel: similarity/bands on the left,
# permutations/rows on the right.
col1, col2 = st.columns(2)
s = col1.slider("Select a Jaccard similarity", min_value=0.0, max_value=1.0, value=0.1)
p = col2.slider("Select a number of permutations", min_value=0, max_value=1000, value=10)
# Equal weights for false positives and false negatives.
# NOTE(review): the similarity `s` is passed as the threshold here, not the
# separate threshold slider created further down — confirm this is intended.
optimal_b, optimal_r = _optimal_param(s, p, 1, 1)

b = col1.slider("Select a number of bands", min_value=1, max_value=100, value=1)
r = col2.slider("Select a number of rows per band", min_value=1, max_value=100, value=1)

col1.metric(label="Optimal number of bands", value=optimal_b)
col2.metric(label="Optimal number of rows per band", value=optimal_r)
50
+
51
st.markdown("---")

# Step-by-step derivation of the candidate-pair probability for the chosen
# s, r and b. The percent sign is written as "\\%" so the rendered text still
# contains a backslash-escaped "%" inside the LaTeX span without relying on
# an invalid Python escape sequence.
st.markdown(f"Two documents that have a Jaccard similarity of $s={s}$ will have:")
st.markdown(f"1. ${s * 100:.2f}\\%$ of their k-shingles will be the same")
st.markdown(f"2. ${s * 100:.2f}\\%$ of their k-shingles' hashes will be the same")
st.markdown(f"3. ${s * 100:.2f}\\%$ of the time, a particular hash will be the same for two documents")
st.markdown(
    f"4. $s^r={100 * s ** r:.2f}\\%$ of the time, they will have the same hashes for a particular band of $r={r}$ rows"
)
st.markdown(
    f"5. $1 - s^r = {100 * (1 - s ** r):.2f}\\%$ of the time, they will have at least one different hash for a particular band"
)
st.markdown(
    f"6. $(1 - s^r)^b = {100 * (1 - s ** r)**b:.2f}\\%$ of the time, they will have at least one different hash for all $b={b}$ bands"
)
st.markdown(
    f"7. $1 - (1 - s^r)^b={100 * (1 - (1 - s ** r)**b):.2f}\\%$ of the time, they will have at least one band with the same hashes"
)
69
+
70
t = st.slider("Select a Jaccard similarity threshold", 0.0, 1.0, 0.1)

# Sample the S-curve 1 - (1 - x^r)^b on [0, 1] for the selected b and r.
x = np.linspace(0, 1, 1000)
y = 1 - (1 - x**r) ** b

fig = go.Figure(
    data=go.Scatter(
        x=x,
        y=y,
        showlegend=False,
    )
)
# Vertical red line marking the threshold.
fig.add_shape(
    type="line",
    x0=t,
    y0=0,
    x1=t,
    y1=1,
    line=dict(
        color="Red",
        width=4,
    ),
)

# One mask splits the samples at the threshold; it is reused for both regions.
below_threshold = x <= t

# False-positive region: under the curve, left of the threshold. The extra
# point (t, 0) drops the outline back to the axis at the threshold.
fig.add_trace(
    go.Scatter(
        x=np.append(x[below_threshold], t),
        y=np.append(y[below_threshold], 0),
        fill="tozeroy",
        fillcolor="rgba(255, 0, 0, 0.2)",
        line_color="rgba(255, 0, 0, 0)",
        showlegend=False,
    )
)

# False-negative region: above the curve, right of the threshold. The outline
# starts at (t, 1), follows the curve, and ends at (1, 1) so that "toself"
# closes the polygon along y = 1.
fig.add_trace(
    go.Scatter(
        x=np.concatenate(([t], x[~below_threshold], [1])),
        y=np.concatenate(([1], y[~below_threshold], [1])),
        fill="toself",
        fillcolor="rgba(0, 255, 0, 0.2)",
        line_color="rgba(0, 255, 0, 0)",
        showlegend=False,
    )
)

st.plotly_chart(fig)
120
+
121
# Areas of the two shaded regions for the selected threshold, bands and rows,
# computed with the same helpers that _optimal_param minimizes over.
false_positive = _false_positive_probability(t, b, r)
false_negative = _false_negative_probability(t, b, r)

cols = st.columns(2)
cols[0].metric(label="False positive area", value=f"{false_positive:.2f}")
cols[1].metric(label="False negative area", value=f"{false_negative:.2f}")