Spaces:

therayz1
/

Exploratory_Data_Analysis

Sleeping

App Files Files Community

therayz1 committed on Oct 29, 2024

Commit

334bfd3

verified ·

1 Parent(s): ca65591

app.py

Browse files

Files changed (1) hide show

app.py +487 -0

app.py ADDED Viewed

	@@ -0,0 +1,487 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
+from sklearn.impute import KNNImputer
+from scipy import stats
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import warnings
+import io
+import base64
+from datetime import datetime
+import json
+import statsmodels.api as sm
+from statsmodels.stats.outliers_influence import variance_inflation_factor
+from scipy.stats import chi2_contingency
+warnings.filterwarnings('ignore')
class DataAnalyzer:
    """Exploratory-analysis helpers over a single CSV-backed DataFrame.

    The instance keeps the loaded frame in ``df`` together with the labels of
    its numeric and object-typed columns. Report methods return user-facing
    text; while no data is loaded they return a "load data first" message
    (``create_visualization`` returns ``None`` instead).
    """

    # Values the UI dropdowns may send to mean "no column selected".
    _NO_SELECTION = (None, "", "None")

    def __init__(self):
        self.df = None
        self.numeric_columns = None
        self.categorical_columns = None

    def load_data(self, file):
        """Read a CSV into ``self.df`` and classify its columns.

        Args:
            file: An uploaded-file object exposing the path as ``.name``, or a
                plain path string.

        Returns:
            A success message, or a message starting with ``"Hata:"`` when the
            file is missing or cannot be parsed (the previous data is kept).
        """
        if file is None:
            return "Hata: Dosya seçilmedi."
        try:
            # Accept both upload objects (``.name``) and bare path strings.
            path = getattr(file, "name", file)
            self.df = pd.read_csv(path)
            self._identify_column_types()
            return "Veri başarıyla yüklendi!"
        except Exception as e:
            # UI boundary: surface any read/parse failure as text.
            return f"Hata: {str(e)}"

    def _identify_column_types(self):
        """Cache the labels of numeric and object-dtype columns of ``self.df``."""
        self.numeric_columns = self.df.select_dtypes(include=[np.number]).columns
        self.categorical_columns = self.df.select_dtypes(include=['object']).columns

    def get_basic_info(self):
        """Return shape, memory usage and a per-column summary as text."""
        if self.df is None:
            return "Önce veri yükleyin!"
        memory_usage = self.df.memory_usage(deep=True).sum()
        info = [
            "### 1. Temel Veri Bilgileri",
            f"Satır Sayısı: {self.df.shape[0]}",
            f"Sütun Sayısı: {self.df.shape[1]}",
            f"Bellek Kullanımı: {memory_usage / 1024:.2f} KB",
            # Data types and sample values
            "\n### 2. Veri Tipleri ve Örnekler",
        ]
        for column in self.df.columns:
            series = self.df[column]
            info.append(f"\n{column}:")
            info.append(f"  - Tip: {series.dtype}")
            info.append(f"  - Benzersiz Değer Sayısı: {series.nunique()}")
            info.append(f"  - İlk 3 Örnek: {', '.join(map(str, series.head(3)))}")
        return "\n".join(info)

    def analyze_missing_values(self):
        """Return per-column missing counts/percentages and the top null patterns."""
        if self.df is None:
            return "Önce veri yükleyin!"
        null_mask = self.df.isnull()
        null_counts = null_mask.sum()
        missing = pd.DataFrame({
            'Eksik Sayı': null_counts,
            'Eksik Yüzde': (null_counts / len(self.df) * 100).round(2)
        })
        # Most frequent row-wise combinations of missing/non-missing cells.
        missing_patterns = null_mask.value_counts().head()
        result = "### Eksik Değer Analizi\n\n"
        result += missing.to_string()
        result += "\n\n### Eksik Değer Örüntüleri (İlk 5)\n\n"
        result += missing_patterns.to_string()
        return result

    def analyze_outliers(self, method='zscore', threshold=3):
        """Report outliers and descriptive statistics for every numeric column.

        Args:
            method: ``'zscore'`` flags values whose absolute z-score exceeds
                ``threshold``; ``'iqr'`` flags values outside
                ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
            threshold: Z-score cut-off. It is not used by the IQR method.

        Returns:
            The report text.
        """
        if self.df is None:
            return "Önce veri yükleyin!"
        results = ["### Aykırı Değer Analizi\n"]
        for column in self.numeric_columns:
            results.append(f"\n{column} analizi:")
            # Work on the NaN-free values so that the outlier mask and the
            # values it selects always refer to the same rows.
            values = self.df[column].dropna()
            if method == 'zscore':
                z_scores = np.abs(stats.zscore(values.to_numpy()))
                outliers = values[np.asarray(z_scores > threshold)]
                results.append(f"Z-score metodu ile {len(outliers)} aykırı değer bulundu")
                if len(outliers) > 0:
                    results.append(f"Aykırı değerler: {outliers.values[:5]}...")
            elif method == 'iqr':
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                outliers = values[(values < (q1 - 1.5 * iqr)) |
                                  (values > (q3 + 1.5 * iqr))]
                results.append(f"IQR metodu ile {len(outliers)} aykırı değer bulundu")
                if len(outliers) > 0:
                    results.append(f"Aykırı değerler: {outliers.values[:5]}...")
            # Descriptive statistics
            results.append("\nTemel İstatistikler:")
            results.append(self.df[column].describe().to_string())
        return "\n".join(results)

    def analyze_correlations(self):
        """Return the numeric correlation matrix and pairwise Cramér's V values."""
        if self.df is None:
            return "Önce veri yükleyin!"
        # Correlation between numeric columns
        numeric_corr = self.df[self.numeric_columns].corr()
        # Cramér's V between categorical columns
        cat_correlations = []
        for col1 in self.categorical_columns:
            for col2 in self.categorical_columns:
                # Visit each unordered pair once.
                if not col1 < col2:
                    continue
                contingency = pd.crosstab(self.df[col1], self.df[col2])
                n = contingency.sum().sum()
                min_dim = min(contingency.shape[0] - 1, contingency.shape[1] - 1)
                if n == 0 or min_dim <= 0:
                    # A column with fewer than two observed levels leaves V
                    # undefined; report NaN instead of dividing by zero.
                    v = float("nan")
                else:
                    chi2, _, _, _ = chi2_contingency(contingency)
                    v = np.sqrt(chi2 / (n * min_dim))
                cat_correlations.append(f"{col1} - {col2}: {v:.3f}")
        result = "### Sayısal Değişkenler Arası Korelasyonlar\n\n"
        result += numeric_corr.round(3).to_string()
        if cat_correlations:
            result += "\n\n### Kategorik Değişkenler Arası İlişkiler (Cramer's V)\n\n"
            result += "\n".join(cat_correlations)
        return result

    def create_visualization(self, plot_type, x_col, y_col=None, color_col=None):
        """Build a Plotly figure of the requested type.

        Args:
            plot_type: One of ``histogram``, ``box``, ``scatter``, ``bar``,
                ``violin``, ``line`` or ``heatmap``.
            x_col: Column for the x axis.
            y_col: Column for the y axis (unused by ``histogram``).
            color_col: Optional grouping column; the ``'None'`` placeholder
                and empty selections mean "no colour".

        Returns:
            The figure, ``None`` when no data is loaded, or an error string if
            the figure could not be built.
        """
        if self.df is None:
            return None
        # The dropdowns send the literal 'None' for "nothing selected";
        # Plotly would look that up as a column name and fail.
        if y_col in self._NO_SELECTION:
            y_col = None
        if color_col in self._NO_SELECTION:
            color_col = None
        try:
            if plot_type == 'histogram':
                return px.histogram(self.df, x=x_col, color=color_col,
                                    title=f'{x_col} Histogram')
            if plot_type == 'heatmap':
                corr = self.df[[x_col, y_col]].corr()
                return px.imshow(corr, title='Correlation Heatmap')
            xy_plots = {
                'box': (px.box, f'{x_col} - {y_col} Box Plot'),
                'scatter': (px.scatter, f'{x_col} vs {y_col} Scatter Plot'),
                'bar': (px.bar, f'{x_col} - {y_col} Bar Plot'),
                'violin': (px.violin, f'{x_col} - {y_col} Violin Plot'),
                'line': (px.line, f'{x_col} - {y_col} Line Plot'),
            }
            if plot_type not in xy_plots:
                raise ValueError(f"Desteklenmeyen grafik tipi: {plot_type}")
            plot_func, title = xy_plots[plot_type]
            return plot_func(self.df, x=x_col, y=y_col, color=color_col, title=title)
        except Exception as e:
            return f"Görselleştirme oluşturulurken hata: {str(e)}"

    def feature_importance(self, target_col):
        """Report VIF values of the numeric features and their target correlation.

        Args:
            target_col: Column excluded from the feature set. When it is
                numeric, each feature's correlation with it is appended.

        Returns:
            The report text, or an error string if the computation fails.
        """
        if self.df is None:
            return "Önce veri yükleyin!"
        try:
            # VIF over the numeric features (target excluded)
            features = self.df[self.numeric_columns].drop(columns=[target_col], errors='ignore')
            # The VIF regression cannot handle NaN, so use complete rows only.
            X = features.dropna()
            vif_data = pd.DataFrame()
            vif_data["Feature"] = X.columns
            vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                               for i in range(X.shape[1])]
            result = "### Özellik Önem Analizi\n\n"
            result += "VIF (Variance Inflation Factor) Değerleri:\n"
            result += vif_data.sort_values('VIF', ascending=False).to_string()
            # Correlation-based importance. Only meaningful for a numeric
            # target; a non-numeric one is skipped so the VIF table survives.
            if target_col in self.numeric_columns:
                correlations = features.corrwith(self.df[target_col])
                result += "\n\nHedef Değişken ile Korelasyonlar:\n"
                result += correlations.sort_values(ascending=False).to_string()
            return result
        except Exception as e:
            return f"Özellik önem analizi sırasında hata: {str(e)}"

    def statistical_tests(self, column1, column2=None):
        """Run a normality test on one column or association tests on two.

        Args:
            column1: First column.
            column2: Optional second column. ``None``, an empty selection or
                the ``'None'`` placeholder selects the single-column branch.

        Returns:
            The report text, or an error string if a test raises.
        """
        if self.df is None:
            return "Önce veri yükleyin!"
        if column2 in self._NO_SELECTION:
            column2 = None
        results = ["### İstatistiksel Test Sonuçları\n"]
        try:
            if column2 is None:
                # Single-variable branch: normality test
                stat, p_value = stats.normaltest(self.df[column1].dropna())
                results.append("Normallik Testi (D'Agostino and Pearson's):")
                results.append(f"Stat: {stat:.4f}, p-value: {p_value:.4f}")
                results.append(f"Sonuç: {'Normal dağılım' if p_value > 0.05 else 'Normal dağılım değil'}\n")
                # Descriptive statistics
                results.append("Temel İstatistikler:")
                results.append(self.df[column1].describe().to_string())
            elif column1 in self.numeric_columns and column2 in self.numeric_columns:
                # Pearson needs row-aligned pairs: keep only rows where both
                # values are present rather than dropping NaNs per column.
                both_present = self.df[column1].notna() & self.df[column2].notna()
                corr, p_value = stats.pearsonr(self.df.loc[both_present, column1],
                                               self.df.loc[both_present, column2])
                results.append("Pearson Korelasyon:")
                results.append(f"Correlation: {corr:.4f}, p-value: {p_value:.4f}\n")
                # Independent-samples t-test on each column's own values
                t_stat, p_value = stats.ttest_ind(self.df[column1].dropna(),
                                                  self.df[column2].dropna())
                results.append("Bağımsız T-test:")
                results.append(f"T-stat: {t_stat:.4f}, p-value: {p_value:.4f}\n")
            elif column1 in self.categorical_columns and column2 in self.categorical_columns:
                # Chi-square test of independence
                contingency = pd.crosstab(self.df[column1], self.df[column2])
                chi2, p_value, dof, expected = chi2_contingency(contingency)
                results.append("Chi-square Bağımsızlık Testi:")
                results.append(f"Chi2: {chi2:.4f}, p-value: {p_value:.4f}")
            else:
                # Mixed numeric/categorical pairs have no test here; say so
                # instead of returning a bare header.
                results.append("Bu değişken çifti için uygun bir test yok "
                               "(iki sayısal veya iki kategorik sütun seçin).")
            return "\n".join(results)
        except Exception as e:
            return f"İstatistiksel testler sırasında hata: {str(e)}"
def create_interface():
    """Assemble the Gradio Blocks UI for the analysis tool and return it.

    The interface is built but not launched. All analysis tabs are siblings
    at the top level, and failures in handlers whose output is a file or plot
    component are raised as ``gr.Error`` so the user sees the message instead
    of the component receiving a string it cannot render.

    Returns:
        The ``gr.Blocks`` application.
    """
    # NOTE(review): this single analyzer is captured by every callback, so
    # all browser sessions share (and overwrite) one dataset — confirm that
    # is acceptable for the deployment.
    analyzer = DataAnalyzer()

    def _selection(value):
        """Map the dropdowns' 'None' placeholder (or no pick) to ``None``."""
        return None if value in (None, "", "None") else value

    def _csv_path(file):
        """Return the path of an upload object, or the value itself if it is a path."""
        return getattr(file, "name", file)

    def refresh_column_choices(file):
        """Return updated choices for every column dropdown after an upload.

        The CSV is parsed once and the result is fanned out to all seven
        dropdowns, in the order they are listed in the ``change`` binding.
        """
        if file is None:
            return tuple(gr.Dropdown() for _ in range(7))
        df = pd.read_csv(_csv_path(file))
        all_columns = df.columns.tolist()
        optional_columns = ['None'] + all_columns
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        return (
            gr.Dropdown(choices=all_columns),                        # x_col
            gr.Dropdown(choices=all_columns),                        # y_col
            gr.Dropdown(choices=optional_columns),                   # color_col
            gr.Dropdown(choices=all_columns),                        # stat_col1
            gr.Dropdown(choices=optional_columns),                   # stat_col2
            gr.Dropdown(choices=all_columns),                        # target_col
            gr.Dropdown(choices=numeric_columns, multiselect=True),  # columns_to_process
        )

    def render_plot(kind, x, y, color):
        """Build the figure, turning analyzer failures into ``gr.Error``."""
        figure = analyzer.create_visualization(kind, x, _selection(y), _selection(color))
        if figure is None:
            raise gr.Error("Önce veri yükleyin!")
        if isinstance(figure, str):
            # The analyzer reports failures as text; a Plot cannot show that.
            raise gr.Error(figure)
        return figure

    def run_statistical_tests(first, second):
        """Run the tests, treating the 'None' placeholder as no second column."""
        return analyzer.statistical_tests(first, _selection(second))

    def preprocess_data(file, method, columns):
        """Scale or log-transform the chosen columns and return the CSV path."""
        if file is None:
            raise gr.Error("Önce veri yükleyin!")
        if not columns:
            raise gr.Error("İşlenecek en az bir sütun seçin!")
        scalers = {
            "standardization": StandardScaler,
            "minmax": MinMaxScaler,
            "robust": RobustScaler,
        }
        try:
            df = pd.read_csv(_csv_path(file))
            processed_df = df.copy()
            if method == "log":
                for col in columns:
                    processed_df[col] = np.log1p(df[col])
            else:
                processed_df[columns] = scalers[method]().fit_transform(df[columns])
            # Every method, including "log", must end by writing the file:
            # the output component expects a path, not a DataFrame.
            output_path = "preprocessed_data.csv"
            processed_df.to_csv(output_path, index=False)
            return output_path
        except Exception as e:
            raise gr.Error(f"Ön işleme sırasında hata: {str(e)}") from e

    def generate_report(file):
        """Reload the file into the analyzer and write a combined text report."""
        if file is None:
            raise gr.Error("Önce veri yükleyin!")
        status = analyzer.load_data(file)
        if status.startswith("Hata"):
            # Do not build a report from whatever data was loaded before.
            raise gr.Error(status)
        try:
            report = [
                "# Veri Analiz Raporu",
                f"Oluşturma Tarihi: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
                "## 1. Temel Bilgiler",
                analyzer.get_basic_info(),
                "\n## 2. Eksik Değer Analizi",
                analyzer.analyze_missing_values(),
                "\n## 3. Aykırı Değer Analizi",
                analyzer.analyze_outliers(),
                "\n## 4. Korelasyon Analizi",
                analyzer.analyze_correlations(),
            ]
            # Save the report
            with open("data_analysis_report.txt", "w", encoding="utf-8") as f:
                f.write("\n".join(report))
            return "data_analysis_report.txt"
        except Exception as e:
            raise gr.Error(f"Rapor oluşturma sırasında hata: {str(e)}") from e

    with gr.Blocks() as demo:
        gr.Markdown("# Gelişmiş Veri Analiz Aracı")

        with gr.Tab("Veri Yükleme ve Temel Bilgiler"):
            file_input = gr.File(label="CSV Dosyası Yükleyin")
            load_button = gr.Button("Veri Yükle")
            info_button = gr.Button("Temel Bilgileri Göster")
            output_text = gr.Textbox(label="Sonuçlar", lines=20)
            load_button.click(analyzer.load_data, inputs=[file_input], outputs=[output_text])
            info_button.click(analyzer.get_basic_info, outputs=[output_text])

        with gr.Tab("Eksik Değer Analizi"):
            missing_button = gr.Button("Eksik Değerleri Analiz Et")
            missing_output = gr.Textbox(label="Eksik Değer Analizi", lines=15)
            missing_button.click(analyzer.analyze_missing_values, outputs=[missing_output])

        with gr.Tab("Aykırı Değer Analizi"):
            with gr.Row():
                outlier_method = gr.Radio(["zscore", "iqr"], label="Analiz Metodu", value="zscore")
                outlier_threshold = gr.Slider(minimum=1, maximum=5, value=3, label="Eşik Değeri")
            outlier_button = gr.Button("Aykırı Değerleri Analiz Et")
            outlier_output = gr.Textbox(label="Aykırı Değer Analizi", lines=15)
            outlier_button.click(
                analyzer.analyze_outliers,
                inputs=[outlier_method, outlier_threshold],
                outputs=[outlier_output]
            )

        with gr.Tab("Korelasyon Analizi"):
            corr_button = gr.Button("Korelasyonları Analiz Et")
            corr_output = gr.Textbox(label="Korelasyon Analizi", lines=15)
            corr_button.click(analyzer.analyze_correlations, outputs=[corr_output])

        with gr.Tab("Görselleştirme"):
            with gr.Row():
                plot_type = gr.Dropdown(
                    choices=[
                        "histogram", "box", "scatter", "bar",
                        "violin", "line", "heatmap"
                    ],
                    label="Grafik Tipi",
                    value="histogram"
                )
                x_col = gr.Dropdown(label="X Ekseni")
                y_col = gr.Dropdown(label="Y Ekseni")
                color_col = gr.Dropdown(label="Renk Değişkeni (Opsiyonel)")
            plot_button = gr.Button("Grafik Oluştur")
            plot_output = gr.Plot(label="Görselleştirme")
            plot_button.click(
                render_plot,
                inputs=[plot_type, x_col, y_col, color_col],
                outputs=[plot_output]
            )

        with gr.Tab("İstatistiksel Analizler"):
            with gr.Row():
                stat_col1 = gr.Dropdown(label="Birinci Değişken")
                stat_col2 = gr.Dropdown(label="İkinci Değişken (Opsiyonel)")
            stat_button = gr.Button("İstatistiksel Testleri Çalıştır")
            stat_output = gr.Textbox(label="Test Sonuçları", lines=15)
            stat_button.click(
                run_statistical_tests,
                inputs=[stat_col1, stat_col2],
                outputs=[stat_output]
            )

        with gr.Tab("Özellik Önem Analizi"):
            target_col = gr.Dropdown(label="Hedef Değişken")
            importance_button = gr.Button("Özellik Önemini Analiz Et")
            importance_output = gr.Textbox(label="Özellik Önem Analizi", lines=15)
            importance_button.click(
                analyzer.feature_importance,
                inputs=[target_col],
                outputs=[importance_output]
            )

        with gr.Tab("Veri Ön İşleme"):
            with gr.Row():
                preprocess_method = gr.Radio(
                    choices=["standardization", "minmax", "robust", "log"],
                    label="Ölçeklendirme Metodu",
                    value="standardization"
                )
                columns_to_process = gr.Dropdown(
                    label="İşlenecek Sütunlar",
                    multiselect=True
                )
            preprocess_button = gr.Button("Ön İşleme Uygula")
            preprocess_output = gr.File(label="İşlenmiş Veri")
            preprocess_button.click(
                preprocess_data,
                inputs=[file_input, preprocess_method, columns_to_process],
                outputs=[preprocess_output]
            )

        with gr.Tab("Rapor Oluşturma"):
            report_button = gr.Button("Kapsamlı Rapor Oluştur")
            report_output = gr.File(label="Oluşturulan Rapor")
            report_button.click(
                generate_report,
                inputs=[file_input],
                outputs=[report_output]
            )

        # One handler refreshes every column dropdown, so the uploaded CSV is
        # parsed once per upload. It is bound last because it needs all the
        # dropdowns to exist.
        file_input.change(
            refresh_column_choices,
            inputs=[file_input],
            outputs=[
                x_col, y_col, color_col,
                stat_col1, stat_col2,
                target_col,
                columns_to_process,
            ]
        )

    return demo
# Build the interface and start serving it when the file is run as a script.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()