JackyZzZzZ committed on
Commit
cbd6355
·
2 Parent(s): d7a02af f989ff9

Merged some changes

Browse files
.github/workflows/ci_tox.yml CHANGED
@@ -24,7 +24,8 @@ jobs:
24
  runs-on: ${{ matrix.os }}
25
  strategy:
26
  matrix:
27
- os: [macOS-latest, windows-latest, ubuntu-latest]
 
28
  python-version: ["3.9", "3.10", "3.11"]
29
 
30
  steps:
 
24
  runs-on: ${{ matrix.os }}
25
  strategy:
26
  matrix:
27
+ # os: [macOS-latest, windows-latest, ubuntu-latest]
28
+ os: [macos-13, windows-latest, ubuntu-latest]
29
  python-version: ["3.9", "3.10", "3.11"]
30
 
31
  steps:
requirements.txt CHANGED
@@ -1,8 +1,9 @@
1
- bitarray~=2.5.1
2
  importlib-resources>=5.6.0
3
  numpy>=1.21.2
4
  pandas>=1.3.5
5
- pytest>=6.2.5
6
  scikit-learn>=1.0.1
7
- scipy==1.11.1
8
  setuptools>=58.0.4
 
 
1
+ bitarray>=2.5.1
2
  importlib-resources>=5.6.0
3
  numpy>=1.21.2
4
  pandas>=1.3.5
5
+ pytest>=7.4.0
6
  scikit-learn>=1.0.1
7
+ scipy>=1.11.1
8
  setuptools>=58.0.4
9
+ streamlit>=1.34.0
requirements_dev.txt CHANGED
@@ -1,4 +1,4 @@
1
- bitarray~=2.5.1
2
  coverage>=6.3.2
3
  hypothesis
4
  importlib-resources>=5.6.0
@@ -6,9 +6,9 @@ numpy>=1.21.2
6
  openpyxl
7
  pandas>=1.3.5
8
  pre-commit
9
- # pytest>=6.2.5
10
- pytest==7.4.0
11
  pytest-cov>=3.0.0
12
  scikit-learn>=1.0.1
13
- scipy==1.11.1
14
  setuptools>=58.0.4
 
 
1
+ bitarray>=2.5.1
2
  coverage>=6.3.2
3
  hypothesis
4
  importlib-resources>=5.6.0
 
6
  openpyxl
7
  pandas>=1.3.5
8
  pre-commit
9
+ pytest>=7.4.0
 
10
  pytest-cov>=3.0.0
11
  scikit-learn>=1.0.1
12
+ scipy>=1.11.1
13
  setuptools>=58.0.4
14
+ streamlit>=1.34.0
setup.py CHANGED
@@ -56,8 +56,8 @@ setup(
56
  # Allows `setup.py test` to work correctly with pytest
57
  setup_requires=[
58
  "numpy>=1.21.2",
59
- "scipy==1.11.1",
60
- "pytest>=6.2.4",
61
  "scikit-learn",
62
  "bitarray",
63
  ]
@@ -66,8 +66,8 @@ setup(
66
  url="https://github.com/theochem/Selector", # Website
67
  install_requires=[
68
  "numpy>=1.21.2",
69
- "scipy==1.11.1",
70
- "pytest>=6.2.4",
71
  "scikit-learn",
72
  "bitarray",
73
  ],
 
56
  # Allows `setup.py test` to work correctly with pytest
57
  setup_requires=[
58
  "numpy>=1.21.2",
59
+ "scipy>=1.11.1",
60
+ "pytest>=7.4.0",
61
  "scikit-learn",
62
  "bitarray",
63
  ]
 
66
  url="https://github.com/theochem/Selector", # Website
67
  install_requires=[
68
  "numpy>=1.21.2",
69
+ "scipy>=1.11.1",
70
+ "pytest>=7.4.0",
71
  "scikit-learn",
72
  "bitarray",
73
  ],
streamlit_app/pages/1_MaxMin.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit page: brute-strength MaxMin diversity selection.

The user uploads a feature matrix or a square pairwise-distance matrix
(.csv/.xlsx/.npz/.npy), optionally uploads cluster labels, chooses how many
points to select, runs MaxMin, and downloads the selected indices as CSV/JSON.
"""
import streamlit as st
import numpy as np
import pandas as pd
import json
import os

from sklearn.metrics import pairwise_distances
from selector.methods.distance import MaxMin

# Get the current directory path
current_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the path to the assets directory
assets_dir = os.path.join(current_dir, "..", "assets")

# Set page configuration
st.set_page_config(
    page_title="MaxMin",
    page_icon=os.path.join(assets_dir, "QC-Devs.png")
)

st.title("Brute Strength - MaxMin")

st.sidebar.header("Brute Strength - MaxMin")

st.sidebar.info(
    """
    MaxMin is possibly the most widely used method for dissimilarity-based
    compound selection. When presented with a dataset of samples, the
    initial point is chosen as the dataset's medoid center. Next, the second
    point is chosen to be that which is furthest from this initial point.
    Subsequently, all following points are selected via the following
    logic:

    1. Find the minimum distance from every point to the already-selected ones.
    2. Select the point which has the maximum distance among those calculated
       in the previous step.

    In the current implementation, this method requires or computes the full pairwise-distance
    matrix, so it is not recommended for large datasets.
    """
)

st.sidebar.title("References")

st.sidebar.info("[1] Ashton, Mark, et al., Identification of diverse database subsets using "
                "property‐based and fragment‐based molecular descriptions, "
                "Quantitative Structure‐Activity Relationships 21.6 (2002): 598-604.")


def _read_tabular(uploaded_file, has_header):
    """Parse a .csv or .xlsx upload into a NumPy array.

    Bug fix: .xlsx files were previously parsed with ``pd.read_csv``, which
    cannot read Excel's zip-based format; they are now routed to
    ``pd.read_excel``. ``has_header`` controls whether the first row is
    treated as column names (and therefore dropped from the data).
    """
    header = 0 if has_header else None
    if uploaded_file.name.endswith(".xlsx"):
        return pd.read_excel(uploaded_file, header=header).values
    return pd.read_csv(uploaded_file, header=header).values


# File uploader for feature matrix or distance matrix (required)
matrix_file = st.file_uploader("Upload a feature matrix or distance matrix (required)", type=["csv", "xlsx", "npz", "npy"], key="matrix_file")

# Clear selected indices if a new matrix file is uploaded
if matrix_file is None:
    st.session_state.pop("selected_ids", None)

# Bug fix: `matrix` was previously undefined until a file was uploaded, so
# pressing "Run MaxMin Algorithm" without an upload raised a NameError.
matrix = None

# Load data from matrix file
if matrix_file is not None:
    try:
        if matrix_file.name.endswith((".csv", ".xlsx")):
            header_option = st.checkbox("Does the file have a header?", key="header_option")
            st.warning("⚠️ Warning: This will affect the final output if not specified correctly.")
            matrix = _read_tabular(matrix_file, header_option)
            st.write("Matrix shape:", matrix.shape)
            st.write(matrix)
        elif matrix_file.name.endswith(".npz"):
            matrix_data = np.load(matrix_file)
            # Select the array in the .npz file
            array_names = matrix_data.files
            selected_array = st.selectbox("Select the array to use", array_names)
            matrix = matrix_data[selected_array]
            st.write("Matrix shape:", matrix.shape)
            st.write(matrix)
        elif matrix_file.name.endswith(".npy"):
            matrix = np.load(matrix_file)
            st.write("Matrix shape:", matrix.shape)
            st.write(matrix)
    except Exception as e:
        st.error(f'An error occurred while loading matrix file: {e}')
        matrix = None


# Input for number of points to select (required)
num_points = st.number_input("Number of points to select", min_value=1, step=1, key="num_points")

# Input for cluster label list (optional); the uploader only accepts
# csv/xlsx, so no extension dispatch is needed here.
label_file = st.file_uploader("Upload a cluster label list (optional)", type=["csv", "xlsx"], key="label_file")
labels = None
if label_file is not None:
    try:
        label_header_option = st.checkbox("Does the file have a header?",
                                          key="label_header_option")
        st.warning(
            "⚠️ Warning: This will affect the final output if not specified correctly.")
        labels = _read_tabular(label_file, label_header_option).flatten()
        st.write("Cluster labels shape:", labels.shape)
        st.write(labels)
    except Exception as e:
        st.error(f'An error occurred while loading cluster label file: {e}')
        labels = None


if st.button("Run MaxMin Algorithm"):
    if matrix is None:
        # Guard: without this, running with no (or a failed) upload crashed
        # on `matrix.shape` with a misleading generic error message.
        st.error("Please upload a valid feature matrix or distance matrix first.")
    else:
        try:
            # Heuristic from the original code: a square matrix is treated as a
            # precomputed distance matrix; otherwise it is a feature matrix and
            # Euclidean pairwise distances are computed on the fly.
            if matrix.shape[0] == matrix.shape[1]:
                selector = MaxMin()
            else:
                selector = MaxMin(lambda x: pairwise_distances(x, metric="euclidean"))
            selected_ids = selector.select(matrix, size=num_points, labels=labels)

            # Convert selected indices to plain Python ints (JSON-serializable)
            selected_ids = [int(i) for i in selected_ids]

            # Save selected indices to session state
            st.session_state['selected_ids'] = selected_ids
        except ValueError as ve:
            st.error(f"An error occurred while running the MaxMin algorithm: {ve}")
        except Exception as e:
            st.error(f"An error occurred while running the MaxMin algorithm: {e}")

# Check if the selected indices are stored in the session state
if 'selected_ids' in st.session_state and matrix_file is not None:
    selected_ids = st.session_state['selected_ids']
    st.write("Selected indices:", selected_ids)

    # export format
    export_format = st.selectbox("Select export format", ["CSV", "JSON"], key="export_format")

    if export_format == "CSV":
        csv_data = pd.DataFrame(selected_ids, columns=["Selected Indices"])
        csv = csv_data.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download as CSV",
            data=csv,
            file_name='selected_indices.csv',
            mime='text/csv',
        )
    elif export_format == "JSON":
        json_data = json.dumps({"Selected Indices": selected_ids})
        st.download_button(
            label="Download as JSON",
            data=json_data,
            file_name='selected_indices.json',
            mime='application/json',
        )