amonfortc committed on
Commit
6a9ff1a
·
verified ·
1 Parent(s): fec3b4a

Upload 3 files

Browse files
Files changed (3) hide show
  1. scripts/app.py +115 -0
  2. scripts/fibropred_model.py +195 -0
  3. scripts/model_utils.py +34 -0
scripts/app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from model_utils import load_all_models, predict_with_model
3
+
4
+ # Load all models
5
+ models, model_features = load_all_models()
6
+
7
# Map user-facing target names to the identifiers of the saved models
MODEL_MAPPING = {
    target: f"{target}_random_forest_model"
    for target in (
        "Death",
        "Binary diagnosis",
        "Necessity of transplantation",
        "Progressive disease",
    )
}

# Reverse lookup (saved model identifier -> user-facing name), kept for convenience
INVERSE_MODEL_MAPPING = dict(zip(MODEL_MAPPING.values(), MODEL_MAPPING.keys()))

# Input features shown in the UI for each target variable, in display order
FEATURES = {
    "Death": [
        'Pedigree',
        'Age at diagnosis',
        'FVC (L) at diagnosis',
        'FVC (%) at diagnosis',
        'DLCO (%) at diagnosis',
        'RadioWorsening2y',
        'Severity of telomere shortening - Transform 4',
        'Progressive disease',
    ],
    "Binary diagnosis": [
        'Pedigree',
        'Age at diagnosis',
        'Antifibrotic Drug',
        'Prednisone',
        'Mycophenolate',
        'FVC (L) at diagnosis',
        'FVC (%) at diagnosis',
        'DLCO (%) at diagnosis',
    ],
    "Necessity of transplantation": [
        'Pedigree',
        'Age at diagnosis',
        'FVC (L) at diagnosis',
        'FVC (%) at diagnosis',
        'DLCO (%) at diagnosis',
        'FVC (L) 1 year after diagnosis',
        'FVC (%) 1 year after diagnosis',
        'DLCO (%) 1 year after diagnosis',
        'RadioWorsening2y',
    ],
    "Progressive disease": [
        'Pedigree',
        'Age at diagnosis',
        'FVC (L) at diagnosis',
        'FVC (%) at diagnosis',
        'DLCO (%) at diagnosis',
        'FVC (L) 1 year after diagnosis',
        'FVC (%) 1 year after diagnosis',
        'DLCO (%) 1 year after diagnosis',
        'RadioWorsening2y',
        'Genetic mutation studied in patient',
    ],
}

# Shared (minimum, maximum) pairs reused by several features below
_BINARY_RANGE = (0, 1)
_LITRES_RANGE = (0.0, 5.0)
_PERCENT_RANGE = (0.0, 200.0)

# Allowed (minimum, maximum) input values per feature, used to bound the UI fields
FEATURE_RANGES = {
    'Pedigree': (0, 67),
    'Age at diagnosis': (0, 200),
    'FVC (L) at diagnosis': _LITRES_RANGE,
    'FVC (%) at diagnosis': _PERCENT_RANGE,
    'DLCO (%) at diagnosis': _PERCENT_RANGE,
    'RadioWorsening2y': (0, 3),
    'Severity of telomere shortening - Transform 4': (1, 6),
    'Progressive disease': _BINARY_RANGE,
    'Antifibrotic Drug': _BINARY_RANGE,
    'Prednisone': _BINARY_RANGE,
    'Mycophenolate': _BINARY_RANGE,
    'FVC (L) 1 year after diagnosis': _LITRES_RANGE,
    'FVC (%) 1 year after diagnosis': _PERCENT_RANGE,
    'DLCO (%) 1 year after diagnosis': _PERCENT_RANGE,
    'Genetic mutation studied in patient': _BINARY_RANGE,
    'Comorbidities': _BINARY_RANGE,
}
61
+
62
+
63
+ # Define prediction function
64
def make_prediction(input_features, friendly_model_name):
    """
    Predict using the selected model and input features.

    Parameters:
    - input_features: Sequence of raw input values, one per feature the
      model was saved with, in that order.
    - friendly_model_name: User-facing model name (a key of MODEL_MAPPING).

    Returns:
    - A human-readable string: the prediction on success, otherwise a
      message describing why the input was rejected.
    """
    # Map the friendly model name to the real model name
    target_model = MODEL_MAPPING.get(friendly_model_name)
    if target_model not in models:
        return f"Model '{friendly_model_name}' not found. Please select a valid model."

    model = models[target_model]
    features = model_features[target_model]

    if len(input_features) != len(features):
        return f"Invalid input. Expected features: {features}"

    # float() raises TypeError for None (what an empty form field is expected
    # to deliver -- confirm against Gradio) and ValueError for non-numeric
    # text. Report that to the user instead of letting the request crash.
    try:
        input_array = [float(x) for x in input_features]
    except (TypeError, ValueError):
        return f"Invalid input. Every feature needs a numeric value. Expected features: {features}"

    prediction = predict_with_model(model, input_array)
    return f"Prediction for {friendly_model_name}: {prediction}"
82
+
83
+ # Define Gradio interface
84
def gradio_interface():
    """
    Assemble the Gradio app: one prediction form per target, shown as tabs.

    Every key of FEATURES gets its own gr.Interface whose inputs are numeric
    fields for that target's features; the forms are then combined into a
    gr.TabbedInterface, which is returned (not launched).
    """

    def create_inputs_for_features(features):
        # One numeric field per feature; a feature missing from
        # FEATURE_RANGES gets (None, None), i.e. no bounds.
        def number_field(feature):
            low, high = FEATURE_RANGES.get(feature, (None, None))
            return gr.Number(label=f"{feature} (Range: {low} - {high})", minimum=low, maximum=high)

        return [number_field(feature) for feature in features]

    # Build one form per target variable
    tabs = [
        gr.Interface(
            # `target` is bound as a default argument so each form keeps its
            # own target rather than the loop's last value.
            fn=lambda *args, target=target: make_prediction(args, target),
            inputs=create_inputs_for_features(features),
            outputs=gr.Text(label="Prediction Result"),
            title=f"Prediction for {target}",
            description=f"Provide values for features relevant to {target}",
        )
        for target, features in FEATURES.items()
    ]

    # Combine the forms into a tabbed layout, one tab per target
    return gr.TabbedInterface(interface_list=tabs, tab_names=list(FEATURES))
111
+
112
+ # Launch Gradio app
113
+ if __name__ == "__main__":
114
+ interface = gradio_interface()
115
+ interface.launch()
scripts/fibropred_model.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+ import joblib
5
+ from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
6
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
7
+ from sklearn.impute import SimpleImputer
8
+ from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc
9
+ from sklearn.feature_selection import SelectFromModel
10
+ import matplotlib.pyplot as plt
11
+ import seaborn as sns
12
+
13
+ # Load dataset
14
def load_data(file_path):
    """Read the Excel workbook at `file_path` into a DataFrame.

    `header=1` makes the second spreadsheet row supply the column names.
    """
    return pd.read_excel(file_path, header=1)
17
+
18
+ # Preprocess data including categorical variables
19
def preprocess_data_with_categoricals(df):
    """
    Clean the raw dataset and encode its categorical variables.

    Steps: treat -9 as missing, drop columns with more than 50% missing
    values, drop a fixed list of columns, median-impute numeric columns,
    binarize 'Binary diagnosis' and 'Death' when they hold text labels, and
    one-hot encode the remaining text columns.

    Parameters:
    - df: Raw DataFrame. It is not modified; a cleaned copy is returned.

    Returns:
    - (df, numeric_cols, categorical_cols): the processed DataFrame, the
      Index of its numeric/boolean columns, and the Index of its remaining
      object-dtype columns.
    """
    # Replace -9 with NaN for missing values. Done on a copy: replacing in
    # place would silently alter the caller's DataFrame.
    df = df.replace(-9, np.nan)

    # Drop columns with >50% missing values
    missing_percentage = df.isnull().sum() / len(df) * 100
    df = df.drop(columns=missing_percentage[missing_percentage > 50].index)

    # Drop specific columns
    drop_columns = ['ProgressiveDisease', 'Final diagnosis', 'Transplantation date', 'Cause of death', 'Date of death', 'COD NUMBER']
    df = df.drop(columns=[col for col in drop_columns if col in df.columns])

    # Impute missing values (skipped when there is nothing numeric to fit on)
    numeric_cols = df.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 0:
        imputer = SimpleImputer(strategy='median')
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # Handle binary variables specifically. Only text columns are mapped:
    # comparing str(1.0) with "yes"/"ipf" never matches, so applying the
    # mapping to an already-numeric column would turn every label into 0.
    for column, positive_label in (('Binary diagnosis', 'ipf'), ('Death', 'yes')):
        if column in df.columns and not pd.api.types.is_numeric_dtype(df[column]):
            normalized = df[column].astype(str).str.strip().str.lower()
            df[column] = (normalized == positive_label).astype(int)

    # Apply one-hot encoding to categorical variables
    df = apply_one_hot_encoding(df)

    # Separate categorical and numerical variables. 'bool' is included so
    # that dummy columns emitted as booleans still count as model inputs.
    categorical_cols = df.select_dtypes(include=['object']).columns
    numeric_cols = df.select_dtypes(include=['number', 'bool']).columns
    print("Categorical Variables:", categorical_cols.tolist())
    print("Numerical Variables:", numeric_cols.tolist())
    return df, numeric_cols, categorical_cols
56
+
57
+ # Apply one-hot encoding to categorical variables
58
def apply_one_hot_encoding(df):
    """
    One-hot encode every object-dtype column of `df`.

    The first level of each encoded column is dropped (`drop_first=True`).
    Dummy columns are emitted as integers: pandas 2.x defaults to boolean
    dummies, which `select_dtypes(include=['number'])` does not select, so
    the encoded features would otherwise be left out of the numeric
    feature set.

    Parameters:
    - df: Input DataFrame (not modified).

    Returns:
    - A new DataFrame with the object columns replaced by 0/1 dummies.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns
    return pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)
62
+
63
+ # Select predictors using feature importance
64
def select_important_features(X, y, threshold=0.03):
    """
    Keep the columns of `X` whose random-forest importance reaches `threshold`.

    Parameters:
    - X: Feature DataFrame.
    - y: Target values aligned with the rows of `X`.
    - threshold: Importance cut-off handed to SelectFromModel.

    Returns:
    - (X_selected, selected_features): a DataFrame restricted to the kept
      columns, and the Index of their names.
    """
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)
    selector = SelectFromModel(model, threshold=threshold, prefit=True)
    X_reduced = selector.transform(X)
    selected_features = X.columns[selector.get_support()]
    # Carry over X's row index: without it the result gets a fresh 0..n-1
    # index and no longer lines up with y in index-aligned pandas operations.
    return pd.DataFrame(X_reduced, columns=selected_features, index=X.index), selected_features
71
+
72
+ # Visualize feature importance
73
def plot_feature_importance(model, features, target):
    """
    Show a horizontal bar chart of `model.feature_importances_`, most
    important feature first.

    Parameters:
    - model: Fitted estimator exposing `feature_importances_`.
    - features: Feature names, in the same order as the importances.
    - target: Target name used in the chart title.
    """
    scores = model.feature_importances_
    descending = np.argsort(scores)[::-1]
    ranked_scores = scores[descending]
    ranked_names = np.array(features)[descending]

    plt.figure(figsize=(10, 6))
    sns.barplot(x=ranked_scores, y=ranked_names)
    plt.title(f'Feature Importance for {target}')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.show()
83
+
84
+ # Visualize overfitting and optimization results
85
def plot_model_performance(cv_scores, train_scores, test_scores, target ,metric_name="Accuracy"):
    """
    Show two side-by-side panels for judging overfitting.

    Left: the per-fold values in `cv_scores`. Right: a bar chart comparing
    the mean of `train_scores` with `test_scores`.

    Parameters:
    - cv_scores: Sequence of per-fold scores.
    - train_scores: Object with a `.mean()` method (e.g. a NumPy array).
    - test_scores: Single score plotted as the 'Test' bar.
    - target: Target name used in the left panel's title.
    - metric_name: Metric label used in titles and axis labels.
    """
    plt.figure(figsize=(12, 6))

    # Left panel: one point per cross-validation fold
    plt.subplot(1, 2, 1)
    plt.plot(cv_scores, label='Cross-validation scores', marker='o')
    plt.title(f'Cross-validation {metric_name} for {target}')
    plt.xlabel('Fold')
    plt.ylabel(metric_name)
    plt.grid(True)
    plt.legend()

    # Right panel: train vs test comparison
    bar_labels = ['Train', 'Test']
    bar_heights = [train_scores.mean(), test_scores]
    bar_colors = ['blue', 'orange']
    plt.subplot(1, 2, 2)
    plt.bar(bar_labels, bar_heights, color=bar_colors)
    plt.title(f'{metric_name}: Train vs Test')
    plt.ylabel(metric_name)
    plt.grid(True)

    plt.tight_layout()
    plt.show()
106
+
107
+ # Plot ROC-AUC curve
108
def plot_roc_auc(model, X_test, y_test, target):
    """
    Plot the ROC curve of `model` on the test data, with its AUC in the legend.

    Parameters:
    - model: Fitted classifier exposing `predict_proba`.
    - X_test: Test features.
    - y_test: True test labels.
    - target: Target name used in the chart title.
    """
    # Column 1 of predict_proba: probabilities for the positive class
    positive_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, positive_prob)
    roc_auc = auc(fpr, tpr)
    curve_label = f'ROC curve (area = {roc_auc:.2f})'

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=curve_label)
    # Dashed diagonal as a visual baseline
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC-AUC Curve for {target}')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()
122
+
123
+ # Save trained model
124
def save_model(model, target, selected_features):
    """
    Persist a trained model together with the feature names it was fitted on.

    The pair is written with joblib to
    'models/<target>_random_forest_model.pkl' as a dict with the keys
    'model' and 'features'.

    Parameters:
    - model: Trained estimator.
    - target: Target name, used to build the file name.
    - selected_features: Names of the features the model was trained on.
    """
    # exist_ok=True replaces the separate existence check, so a directory
    # created between the check and the creation can no longer raise.
    os.makedirs("models", exist_ok=True)
    file_name = f"models/{target}_random_forest_model.pkl"
    joblib.dump({'model': model, 'features': selected_features}, file_name)
    print(f"Model and features saved to {file_name}")
131
+
132
+
133
+ # Main pipeline
134
def main():
    """
    Run the full training pipeline for every target variable.

    For each target: split the data, select important features on the
    training split, fit a RandomForestClassifier, report cross-validation,
    training and test accuracy, show the diagnostic plots, and save the
    model together with its selected feature names.
    """
    file_path = 'FibroPredCODIFICADA.xlsx'
    df = load_data(file_path)

    # Target columns
    target_columns = ['Death', 'Progressive disease', 'Necessity of transplantation']

    # Preprocess data
    df, numeric_cols, categorical_cols = preprocess_data_with_categoricals(df)

    for target in target_columns:
        print(f"Processing target: {target}")
        X = df[numeric_cols].drop(columns=target_columns, errors='ignore')  # Ensure target variables are excluded
        y = df[target]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Select important features (fitted on the training split only)
        X_train_selected, selected_features = select_important_features(X_train, y_train)
        X_test_selected = X_test[selected_features]

        print(f"Selected predictors for training {target} ({len(selected_features)} predictors): {selected_features.tolist()}")

        # Train RandomForest model
        model = RandomForestClassifier(n_estimators=300,
                                       max_depth=4,
                                       min_samples_split=10,
                                       min_samples_leaf=10,
                                       class_weight='balanced',
                                       max_features='sqrt',
                                       random_state=42)
        model.fit(X_train_selected, y_train)

        # Cross-validation to check overfitting
        cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_train_selected, y_train, cv=cv, scoring='accuracy')

        # Accuracy on the training data itself. A second cross-validation
        # run was previously plotted as "Train", which hid any train/test
        # gap. Wrapped in an array because plot_model_performance calls
        # .mean() on it.
        train_scores = np.array([accuracy_score(y_train, model.predict(X_train_selected))])

        y_pred_test = model.predict(X_test_selected)
        test_score = accuracy_score(y_test, y_pred_test)

        print(f"Cross-validation accuracy for {target}: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
        print(f"Train accuracy for {target}: {train_scores.mean():.4f}")
        print(f"Test accuracy for {target}: {test_score:.4f}")
        print(classification_report(y_test, y_pred_test))

        # Plot model performance
        plot_model_performance(cv_scores, train_scores, test_score, target, metric_name="Accuracy")

        # Plot feature importance
        print(f"Feature importance for {target}:")
        plot_feature_importance(model, selected_features, target)

        # Plot ROC-AUC Curve
        plot_roc_auc(model, X_test_selected, y_test, target)

        # Save trained model
        save_model(model, target, selected_features.tolist())

    print("Pipeline completed.")
193
+
194
+ if __name__ == "__main__":
195
+ main()
scripts/model_utils.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import joblib
3
+
4
def load_all_models(models_dir="models"):
    """
    Load all models and their features from the given directory.

    Every '*.pkl' file is expected to hold a dict with the keys 'model' and
    'features'; the file name without its extension becomes the model name.

    Parameters:
    - models_dir: Directory containing the saved model files.

    Returns:
    - (models, features): two dicts keyed by model name.

    Raises:
    - FileNotFoundError: if `models_dir` is not an existing directory.
    """
    models = {}
    features = {}
    # isdir rather than exists: a regular file at this path would pass an
    # existence check and then fail inside os.listdir with a different error.
    if not os.path.isdir(models_dir):
        raise FileNotFoundError(f"Models directory '{models_dir}' not found.")

    # Sorted so the load order (and the log output) is deterministic.
    for model_file in sorted(os.listdir(models_dir)):
        if not model_file.endswith(".pkl"):
            continue
        model_name = os.path.splitext(model_file)[0]
        # SECURITY: joblib.load unpickles the file, which can run arbitrary
        # code. Only point this at model files from a trusted source.
        data = joblib.load(os.path.join(models_dir, model_file))
        models[model_name] = data['model']
        features[model_name] = data['features']
        print(f"Model '{model_name}' loaded successfully with features: {features[model_name]}")
    return models, features
21
+
22
def predict_with_model(model, input_data):
    """
    Run a loaded model on a single sample.

    Parameters:
    - model: The loaded model; must expose `predict`.
    - input_data: The feature values of one sample. They are wrapped in a
      one-element list before being handed to `model.predict`.

    Returns:
    - prediction: The first element of the model's output, converted to int.
    """
    single_row_batch = [input_data]
    first_result = model.predict(single_row_batch)[0]
    return int(first_result)