Merged some changes
- .github/workflows/ci_tox.yml +2 -1
- requirements.txt +4 -3
- requirements_dev.txt +4 -4
- setup.py +4 -4
- streamlit_app/pages/1_MaxMin.py +166 -0
.github/workflows/ci_tox.yml
CHANGED
@@ -24,7 +24,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [macOS-latest, windows-latest, ubuntu-latest]
+        # os: [macOS-latest, windows-latest, ubuntu-latest]
+        os: [macos-13, windows-latest, ubuntu-latest]
         python-version: ["3.9", "3.10", "3.11"]

     steps:
requirements.txt
CHANGED
@@ -1,8 +1,9 @@
-bitarray
+bitarray>=2.5.1
 importlib-resources>=5.6.0
 numpy>=1.21.2
 pandas>=1.3.5
-pytest>=
+pytest>=7.4.0
 scikit-learn>=1.0.1
-scipy
+scipy>=1.11.1
 setuptools>=58.0.4
+streamlit>=1.34.0
requirements_dev.txt
CHANGED
@@ -1,4 +1,4 @@
-bitarray
+bitarray>=2.5.1
 coverage>=6.3.2
 hypothesis
 importlib-resources>=5.6.0
@@ -6,9 +6,9 @@ numpy>=1.21.2
 openpyxl
 pandas>=1.3.5
 pre-commit
-
-pytest==7.4.0
+pytest>=7.4.0
 pytest-cov>=3.0.0
 scikit-learn>=1.0.1
-scipy
+scipy>=1.11.1
 setuptools>=58.0.4
+streamlit>=1.34.0
setup.py
CHANGED
@@ -56,8 +56,8 @@ setup(
     # Allows `setup.py test` to work correctly with pytest
     setup_requires=[
         "numpy>=1.21.2",
-        "scipy
-        "pytest>=
+        "scipy>=1.11.1",
+        "pytest>=7.4.0",
         "scikit-learn",
         "bitarray",
     ]
@@ -66,8 +66,8 @@ setup(
     url="https://github.com/theochem/Selector",  # Website
     install_requires=[
         "numpy>=1.21.2",
-        "scipy
-        "pytest>=
+        "scipy>=1.11.1",
+        "pytest>=7.4.0",
         "scikit-learn",
         "bitarray",
     ],
streamlit_app/pages/1_MaxMin.py
ADDED
@@ -0,0 +1,166 @@
+import streamlit as st
+import numpy as np
+import pandas as pd
+import json
+import os
+
+from sklearn.metrics import pairwise_distances
+from selector.methods.distance import MaxMin
+
+# Get the current directory path
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+# Construct the path to the assets directory
+assets_dir = os.path.join(current_dir, "..", "assets")
+
+# Set page configuration
+st.set_page_config(
+    page_title="MaxMin",
+    page_icon=os.path.join(assets_dir, "QC-Devs.png")
+)
+
+st.title("Brute Strength - MaxMin")
+
+st.sidebar.header("Brute Strength - MaxMin")
+
+st.sidebar.info(
+    """
+    MaxMin is possibly the most widely used method for dissimilarity-based
+    compound selection. When presented with a dataset of samples, the
+    initial point is chosen as the dataset's medoid center. Next, the second
+    point is chosen to be that which is furthest from this initial point.
+    Subsequently, all following points are selected via the following
+    logic:
+
+    1. Find the minimum distance from every point to the already-selected ones.
+    2. Select the point which has the maximum distance among those calculated
+       in the previous step.
+
+    In the current implementation, this method requires or computes the full pairwise-distance
+    matrix, so it is not recommended for large datasets.
+    """
+)
+
+st.sidebar.title("References")
+
+st.sidebar.info("[1] Ashton, Mark, et al., Identification of diverse database subsets using "
+                "property‐based and fragment‐based molecular descriptions, "
+                "Quantitative Structure‐Activity Relationships 21.6 (2002): 598-604.")
+
+
+# File uploader for feature matrix or distance matrix (required)
+matrix_file = st.file_uploader("Upload a feature matrix or distance matrix (required)", type=["csv", "xlsx", "npz", "npy"], key="matrix_file")
+
+# Clear selected indices if a new matrix file is uploaded
+if matrix_file is None:
+    st.session_state.pop("selected_ids", None)
+
+# Load data from matrix file
+if matrix_file is not None:
+    try:
+        header_option = None
+        if matrix_file.name.endswith(".csv") or matrix_file.name.endswith(".xlsx"):
+            header_option = st.checkbox("Does the file have a header?", key = "header_option")
+            st.warning("⚠️ Warning: This will affect the final output if not specified correctly.")
+
+        if matrix_file.name.endswith(".csv") or matrix_file.name.endswith(".xlsx"):
+            if header_option:
+                # Load the matrix with header
+                matrix = pd.read_csv(matrix_file).values
+            else:
+                # Load the matrix without header
+                matrix = pd.read_csv(matrix_file, header = None).values
+            st.write("Matrix shape:", matrix.shape)
+            st.write(matrix)
+
+        elif matrix_file.name.endswith(".npz"):
+            matrix_data = np.load(matrix_file)
+            # Select the array in the .npz file
+            array_names = matrix_data.files
+            selected_array = st.selectbox("Select the array to use", array_names)
+            matrix = matrix_data[selected_array]
+            st.write("Matrix shape:", matrix.shape)
+            st.write(matrix)
+        elif matrix_file.name.endswith(".npy"):
+            matrix = np.load(matrix_file)
+            st.write("Matrix shape:", matrix.shape)
+            st.write(matrix)
+    except Exception as e:
+        st.error(f'An error occurred while loading matrix file: {e}')
+        matrix = None
+
+
+# Input for number of points to select (required)
+num_points = st.number_input("Number of points to select", min_value=1, step=1, key="num_points")
+
+# Input for cluster label list (optional)
+label_file = st.file_uploader("Upload a cluster label list (optional)", type=["csv", "xlsx"], key="label_file")
+labels = None
+if label_file is not None:
+    try:
+        label_header_option = None
+        if label_file.name.endswith(".csv") or label_file.name.endswith(".xlsx"):
+            label_header_option = st.checkbox("Does the file have a header?",
+                                              key = "label_header_option")
+            st.warning(
+                "⚠️ Warning: This will affect the final output if not specified correctly.")
+
+        if label_file.name.endswith(".csv") or label_file.name.endswith(".xlsx"):
+            if label_header_option:
+                labels = pd.read_csv(label_file).values.flatten()
+            else:
+                labels = pd.read_csv(label_file, header = None).values.flatten()
+            st.write("Cluster labels shape:", labels.shape)
+            st.write(labels)
+    except Exception as e:
+        st.error(f'An error occurred while loading cluster label file: {e}')
+        labels = None
+
+
+if st.button("Run MaxMin Algorithm"):
+    try:
+        # Check if the input matrix is a feature matrix or a distance matrix
+        if matrix.shape[0] == matrix.shape[1]:
+            # Distance matrix
+            selector = MaxMin()
+            selected_ids = selector.select(matrix, size = num_points, labels = labels)
+        else:
+            # Feature matrix
+            selector = MaxMin(lambda x: pairwise_distances(x, metric = "euclidean"))
+            selected_ids = selector.select(matrix, size = num_points, labels = labels)
+
+        # Convert selected indices to a list of integers
+        selected_ids = [int(i) for i in selected_ids]
+
+        # Save selected indices to session state
+        st.session_state['selected_ids'] = selected_ids
+    except ValueError as ve:
+        st.error(f"An error occurred while running the MaxMin algorithm: {ve}")
+    except Exception as e:
+        st.error(f"An error occurred while running the MaxMin algorithm: {e}")
+
+# Check if the selected indices are stored in the session state
+if 'selected_ids' in st.session_state and matrix_file is not None:
+    selected_ids = st.session_state['selected_ids']
+    st.write("Selected indices:", selected_ids)
+
+    # export format
+    export_format = st.selectbox("Select export format", ["CSV", "JSON"], key="export_format")
+
+    if export_format == "CSV":
+        csv_data = pd.DataFrame(selected_ids, columns = ["Selected Indices"])
+        csv = csv_data.to_csv(index = False).encode('utf-8')
+        st.download_button(
+            label = "Download as CSV",
+            data = csv,
+            file_name = 'selected_indices.csv',
+            mime = 'text/csv',
+        )
+    elif export_format == "JSON":
+        json_data = json.dumps({"Selected Indices": selected_ids})
+        st.download_button(
+            label = "Download as JSON",
+            data = json_data,
+            file_name = 'selected_indices.json',
+            mime = 'application/json',
+        )
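
For readers skimming the diff, here is a minimal NumPy sketch of the greedy selection loop that the page's sidebar text describes: start from the medoid, then repeatedly take the point whose minimum distance to the already-selected points is largest. It assumes a precomputed symmetric distance matrix and is only an illustration; the function name maxmin_sketch is hypothetical and not part of the selector package, whose real entry point is MaxMin().select() as used in the page above.

import numpy as np

def maxmin_sketch(dist: np.ndarray, size: int) -> list:
    """Greedy MaxMin selection on an (n, n) pairwise-distance matrix (illustrative only)."""
    # Start from the medoid: the point with the smallest total distance to all others.
    selected = [int(np.argmin(dist.sum(axis=0)))]
    while len(selected) < size:
        # Step 1: minimum distance from every point to the already-selected ones.
        min_dist = dist[:, selected].min(axis=1)
        # Exclude points that are already selected.
        min_dist[selected] = -np.inf
        # Step 2: pick the point whose minimum distance is the largest.
        selected.append(int(np.argmax(min_dist)))
    return selected

# Tiny usage example: 5 random 2-D points, select 3 of them.
rng = np.random.default_rng(0)
points = rng.random((5, 2))
dist = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
print(maxmin_sketch(dist, 3))

This sketch also makes clear why the page treats a square input matrix as a distance matrix and passes anything else through pairwise_distances first: the selection loop itself only ever consumes pairwise distances.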