Upload 3 files
- scripts/app.py +115 -0
- scripts/fibropred_model.py +195 -0
- scripts/model_utils.py +34 -0
scripts/app.py
ADDED
@@ -0,0 +1,115 @@
import gradio as gr
from model_utils import load_all_models, predict_with_model

# Load all models
models, model_features = load_all_models()

# Map friendly names to the actual model names
MODEL_MAPPING = {
    "Death": "Death_random_forest_model",
    "Binary diagnosis": "Binary diagnosis_random_forest_model",
    "Necessity of transplantation": "Necessity of transplantation_random_forest_model",
    "Progressive disease": "Progressive disease_random_forest_model"
}

# Invert the mapping (optional, for convenience)
INVERSE_MODEL_MAPPING = {v: k for k, v in MODEL_MAPPING.items()}

# Feature sets for each target variable
FEATURES = {
    "Death": [
        'Pedigree', 'Age at diagnosis', 'FVC (L) at diagnosis',
        'FVC (%) at diagnosis', 'DLCO (%) at diagnosis', 'RadioWorsening2y',
        'Severity of telomere shortening - Transform 4', 'Progressive disease'
    ],
    "Binary diagnosis": [
        'Pedigree', 'Age at diagnosis', 'Antifibrotic Drug',
        'Prednisone', 'Mycophenolate', 'FVC (L) at diagnosis',
        'FVC (%) at diagnosis', 'DLCO (%) at diagnosis'
    ],
    "Necessity of transplantation": [
        'Pedigree', 'Age at diagnosis', 'FVC (L) at diagnosis',
        'FVC (%) at diagnosis', 'DLCO (%) at diagnosis',
        'FVC (L) 1 year after diagnosis', 'FVC (%) 1 year after diagnosis',
        'DLCO (%) 1 year after diagnosis', 'RadioWorsening2y'
    ],
    "Progressive disease": [
        'Pedigree', 'Age at diagnosis', 'FVC (L) at diagnosis',
        'FVC (%) at diagnosis', 'DLCO (%) at diagnosis',
        'FVC (L) 1 year after diagnosis', 'FVC (%) 1 year after diagnosis',
        'DLCO (%) 1 year after diagnosis', 'RadioWorsening2y',
        'Genetic mutation studied in patient'
    ]
}

# Valid input range for each feature (used to bound the Gradio number inputs)
FEATURE_RANGES = {
    'Pedigree': (0, 67),
    'Age at diagnosis': (0, 200),
    'FVC (L) at diagnosis': (0.0, 5.0),
    'FVC (%) at diagnosis': (0.0, 200.0),
    'DLCO (%) at diagnosis': (0.0, 200.0),
    'RadioWorsening2y': (0, 3),
    'Severity of telomere shortening - Transform 4': (1, 6),
    'Progressive disease': (0, 1),
    'Antifibrotic Drug': (0, 1),
    'Prednisone': (0, 1),
    'Mycophenolate': (0, 1),
    'FVC (L) 1 year after diagnosis': (0.0, 5.0),
    'FVC (%) 1 year after diagnosis': (0.0, 200.0),
    'DLCO (%) 1 year after diagnosis': (0.0, 200.0),
    'Genetic mutation studied in patient': (0, 1),
    'Comorbidities': (0, 1)
}

# Define prediction function
def make_prediction(input_features, friendly_model_name):
    """
    Predict using the selected model and input features.
    """
    # Map the friendly model name to the real model name
    target_model = MODEL_MAPPING.get(friendly_model_name)
    if target_model not in models:
        return f"Model '{friendly_model_name}' not found. Please select a valid model."

    model = models[target_model]
    features = model_features[target_model]

    if len(input_features) != len(features):
        return f"Invalid input. Expected features: {features}"

    input_array = [float(x) for x in input_features]
    prediction = predict_with_model(model, input_array)
    return f"Prediction for {friendly_model_name}: {prediction}"

# Define Gradio interface
def gradio_interface():
    def create_inputs_for_features(features):
        inputs = []
        for feature in features:
            min_val, max_val = FEATURE_RANGES.get(feature, (None, None))
            inputs.append(gr.Number(label=f"{feature} (Range: {min_val} - {max_val})",
                                    minimum=min_val, maximum=max_val))
        return inputs

    # Create a separate interface for each target variable
    interfaces = []
    for target, features in FEATURES.items():
        inputs = create_inputs_for_features(features)
        interface = gr.Interface(
            # Bind target by value; a bare closure would capture the loop variable
            fn=lambda *args, target=target: make_prediction(args, target),
            inputs=inputs,
            outputs=gr.Text(label="Prediction Result"),
            title=f"Prediction for {target}",
            description=f"Provide values for features relevant to {target}"
        )
        interfaces.append(interface)

    # Combine all interfaces into a tabbed layout
    tabbed_interface = gr.TabbedInterface(
        interface_list=interfaces,
        tab_names=list(FEATURES.keys())
    )
    return tabbed_interface

# Launch Gradio app
if __name__ == "__main__":
    interface = gradio_interface()
    interface.launch()
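
A minimal smoke-test sketch for app.py, assuming the pickled models produced by fibropred_model.py (below) already exist under models/ and that their saved feature lists match the FEATURES dict above; the input values are illustrative placeholders, not clinical data:

from app import make_prediction, FEATURES  # importing app also loads the models

inputs = [1.0] * len(FEATURES["Death"])    # one placeholder value per expected feature
print(make_prediction(inputs, "Death"))    # e.g. "Prediction for Death: 0"
print(make_prediction(inputs, "Unknown"))  # exercises the error path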
scripts/fibropred_model.py
ADDED
@@ -0,0 +1,195 @@
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
def load_data(file_path):
    df = pd.read_excel(file_path, header=1)
    return df
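# Note: header=1 reads the column names from the second spreadsheet row; the
# first row of FibroPredCODIFICADA.xlsx is presumably a title row.
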
# Preprocess data including categorical variables
def preprocess_data_with_categoricals(df):
    # Replace -9 with NaN for missing values
    df.replace(-9, np.nan, inplace=True)

    # Drop columns with >50% missing values
    missing_percentage = df.isnull().sum() / len(df) * 100
    df = df.drop(columns=missing_percentage[missing_percentage > 50].index)

    # Drop specific columns
    drop_columns = ['ProgressiveDisease', 'Final diagnosis', 'Transplantation date', 'Cause of death', 'Date of death', 'COD NUMBER']
    df = df.drop(columns=[col for col in drop_columns if col in df.columns])

    # Impute missing values
    imputer = SimpleImputer(strategy='median')
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # Handle binary variables specifically
    if 'Binary diagnosis' in df.columns:
        df['Binary diagnosis'] = df['Binary diagnosis'].apply(
            lambda x: 1 if str(x).strip().lower() == "ipf" else 0
        )

    if 'Death' in df.columns:
        df['Death'] = df['Death'].apply(
            lambda x: 1 if str(x).strip().lower() == "yes" else 0
        )

    # Apply one-hot encoding to categorical variables
    df = apply_one_hot_encoding(df)

    # Separate categorical and numerical variables
    categorical_cols = df.select_dtypes(include=['object']).columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    print("Categorical Variables:", categorical_cols.tolist())
    print("Numerical Variables:", numeric_cols.tolist())
    return df, numeric_cols, categorical_cols

# Apply one-hot encoding to categorical variables
def apply_one_hot_encoding(df):
    categorical_cols = df.select_dtypes(include=['object']).columns
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    return df

# Select predictors using feature importance
def select_important_features(X, y, threshold=0.03):
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)
    selector = SelectFromModel(model, threshold=threshold, prefit=True)
    X_reduced = selector.transform(X)
    selected_features = X.columns[selector.get_support()]
    return pd.DataFrame(X_reduced, columns=selected_features), selected_features
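
# Note: SelectFromModel keeps only the features whose impurity-based importance
# from the auxiliary forest is >= threshold; raising the threshold yields
# fewer, stronger predictors.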

# Visualize feature importance
def plot_feature_importance(model, features, target):
    importance = model.feature_importances_
    sorted_idx = np.argsort(importance)[::-1]
    plt.figure(figsize=(10, 6))
    sns.barplot(x=importance[sorted_idx], y=np.array(features)[sorted_idx])
    plt.title(f'Feature Importance for {target}')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.show()

# Visualize overfitting and optimization results
def plot_model_performance(cv_scores, train_scores, test_score, target, metric_name="Accuracy"):
    plt.figure(figsize=(12, 6))

    # Cross-validation scores
    plt.subplot(1, 2, 1)
    plt.plot(cv_scores, label='Cross-validation scores', marker='o')
    plt.title(f'Cross-validation {metric_name} for {target}')
    plt.xlabel('Fold')
    plt.ylabel(metric_name)
    plt.grid(True)
    plt.legend()

    # Train vs Test comparison
    plt.subplot(1, 2, 2)
    plt.bar(['Train', 'Test'], [train_scores.mean(), test_score], color=['blue', 'orange'])
    plt.title(f'{metric_name}: Train vs Test')
    plt.ylabel(metric_name)
    plt.grid(True)

    plt.tight_layout()
    plt.show()

# Plot ROC-AUC curve
def plot_roc_auc(model, X_test, y_test, target):
    y_prob = model.predict_proba(X_test)[:, 1]  # probabilities for the positive class
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC-AUC Curve for {target}')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()

# Save trained model
def save_model(model, target, selected_features):
    os.makedirs("models", exist_ok=True)
    file_name = f"models/{target}_random_forest_model.pkl"
    joblib.dump({'model': model, 'features': selected_features}, file_name)
    print(f"Model and features saved to {file_name}")

# Main pipeline
def main():
    file_path = 'FibroPredCODIFICADA.xlsx'
    df = load_data(file_path)

    # Target columns
    target_columns = ['Death', 'Progressive disease', 'Necessity of transplantation']

    # Preprocess data
    df, numeric_cols, categorical_cols = preprocess_data_with_categoricals(df)

    for target in target_columns:
        print(f"Processing target: {target}")
        X = df[numeric_cols].drop(columns=target_columns, errors='ignore')  # ensure target variables are excluded
        y = df[target]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Select important features
        X_train_selected, selected_features = select_important_features(X_train, y_train)
        X_test_selected = X_test[selected_features]

        print(f"Selected predictors for training {target} ({len(selected_features)} predictors): {selected_features.tolist()}")

        # Train RandomForest model
        model = RandomForestClassifier(n_estimators=300,
                                       max_depth=4,
                                       min_samples_split=10,
                                       min_samples_leaf=10,
                                       class_weight='balanced',
                                       max_features='sqrt',
                                       random_state=42)
        model.fit(X_train_selected, y_train)

        # Cross-validation to check overfitting
        cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_train_selected, y_train, cv=cv, scoring='accuracy')
        train_scores = cross_val_score(model, X_train_selected, y_train, cv=15, scoring='accuracy')
        y_pred_test = model.predict(X_test_selected)
        test_score = accuracy_score(y_test, y_pred_test)

        print(f"Cross-validation accuracy for {target}: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
        print(f"Test accuracy for {target}: {test_score:.4f}")
        print(classification_report(y_test, y_pred_test))

        # Plot model performance
        plot_model_performance(cv_scores, train_scores, test_score, target, metric_name="Accuracy")

        # Plot feature importance
        print(f"Feature importance for {target}:")
        plot_feature_importance(model, selected_features, target)

        # Plot ROC-AUC Curve
        plot_roc_auc(model, X_test_selected, y_test, target)

        # Save trained model
        save_model(model, target, selected_features.tolist())

    print("Pipeline completed.")

if __name__ == "__main__":
    main()
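
For reference, a short sketch of how the bundle written by save_model can be reloaded and inspected, assuming a Death model has already been trained and saved (the path follows the f-string in save_model):

import joblib

bundle = joblib.load("models/Death_random_forest_model.pkl")
model = bundle['model']        # the fitted RandomForestClassifier
features = bundle['features']  # selected feature names, in training order
print(len(features), features)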
scripts/model_utils.py
ADDED
@@ -0,0 +1,34 @@
import os
import joblib

def load_all_models(models_dir="models"):
    """
    Load all models and their features from the given directory.
    """
    models = {}
    features = {}
    if not os.path.exists(models_dir):
        raise FileNotFoundError(f"Models directory '{models_dir}' not found.")

    for model_file in os.listdir(models_dir):
        if model_file.endswith(".pkl"):
            model_name = os.path.splitext(model_file)[0]
            data = joblib.load(os.path.join(models_dir, model_file))
            models[model_name] = data['model']
            features[model_name] = data['features']
            print(f"Model '{model_name}' loaded successfully with features: {features[model_name]}")
    return models, features

def predict_with_model(model, input_data):
    """
    Predict using a loaded model.

    Parameters:
    - model: The loaded model.
    - input_data: A list-like of feature values, in the order the model was trained on.

    Returns:
    - prediction: Model prediction as an int (0 or 1).
    """
    prediction = model.predict([input_data])
    return int(prediction[0])
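
A minimal usage sketch for model_utils.py, assuming at least one .pkl bundle saved by fibropred_model.py is present under models/:

from model_utils import load_all_models, predict_with_model

models, features = load_all_models("models")
name = next(iter(models))                     # pick any loaded model
row = [0.0] * len(features[name])             # placeholder inputs, one per feature
print(predict_with_model(models[name], row))  # prints 0 or 1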