Spaces:

WheelyFunTimesTeam
/

WheelyFunTimes

Running

App Files Files Community

elli-teu committed about 1 month ago

Commit

5184bfe

1 Parent(s): 7762bc9

Removed duplicate trips

Browse files

Files changed (1) hide show

app.py +45 -0

app.py CHANGED Viewed

@@ -74,6 +74,48 @@ def get_buses():
     short_bus_list = list(pd.unique(bus_df["route_short_name"]))
     return bus_df, bus_list, short_bus_list
 def plot_graph(plot_df):
     #Nu vill vi plotta!
     categories =  {0 : 'Empty',
@@ -154,6 +196,9 @@ def main():
     if is_local_data_valid():
         st.write("Using cached local data.")
         st.session_state.data = load_local_data("data.csv")
     else:
         # Fetch data if local data is invalid
         if st.session_state.hopsworks_project is None:

     short_bus_list = list(pd.unique(bus_df["route_short_name"]))
     return bus_df, bus_list, short_bus_list
def remove_duplicate_trips(
    df,
    route_id_col="route_id",
    trip_id_col="trip_id",
    stop_id_col="stop_name",
    datetime_col="datetime",
    time_window='3min',
):
    """
    Remove trips that duplicate an earlier trip on the same route.

    Each trip is represented by its chronologically first row (its "start").
    A trip is a duplicate when its start has the same route and the same stop
    as the start that immediately precedes it in time, and the two starts are
    at most ``time_window`` apart. Only the earliest trip of such a run of
    near-simultaneous starts is kept; every row of a kept trip is retained.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing trip data. It is not
            modified; the datetime conversion is done on a copy.
        route_id_col (str): Column name for route IDs.
        trip_id_col (str): Column name for trip IDs.
        stop_id_col (str): Column name for stop IDs.
        datetime_col (str): Column name for departure times (anything
            ``pd.to_datetime`` can parse).
        time_window (str): Time window for considering trips as duplicates
            (e.g., '3min'); passed to ``pd.Timedelta``.
    Returns:
        pd.DataFrame: Rows of the non-duplicate trips, sorted by route, stop
        and datetime, with ``datetime_col`` converted to datetime dtype.
    """
    # Convert on a new frame so the caller's DataFrame is left untouched.
    df = df.assign(**{datetime_col: pd.to_datetime(df[datetime_col])})
    # Sort by route_id, stop_id, and datetime; this is also the output order.
    df = df.sort_values(by=[route_id_col, stop_id_col, datetime_col])

    # Start of each trip = its earliest row. Taking a whole row (instead of
    # GroupBy.first(), which picks the first non-null value per column)
    # guarantees stop and time come from the same observation. The stable
    # sort keeps ties deterministic.
    first_stops = (
        df.sort_values(datetime_col, kind="stable")
        .drop_duplicates(subset=trip_id_col, keep="first")
        # Rows without a trip or route id cannot be attributed to a trip.
        .dropna(subset=[trip_id_col, route_id_col])
    )

    # Order the starts so that potential duplicates are adjacent and the time
    # gap to the previous row is never negative within a route/stop pair.
    starts = first_stops.sort_values(
        by=[route_id_col, stop_id_col, datetime_col]
    )
    same_route = starts[route_id_col].eq(starts[route_id_col].shift())
    same_stop = starts[stop_id_col].eq(starts[stop_id_col].shift())
    gap = starts[datetime_col].diff()
    # Comparisons against the shifted-in NaN/NaT evaluate to False, so the
    # first start of every route/stop pair is never flagged.
    is_duplicate = same_route & same_stop & (gap <= pd.Timedelta(time_window))

    # Filter the original DataFrame to retain only the non-duplicate trips.
    unique_trip_ids = starts.loc[~is_duplicate, trip_id_col].unique()
    return df[df[trip_id_col].isin(unique_trip_ids)]
 def plot_graph(plot_df):
     #Nu vill vi plotta!
     categories =  {0 : 'Empty',
     if is_local_data_valid():
         st.write("Using cached local data.")
         st.session_state.data = load_local_data("data.csv")
+        if "first" not in st.session_state:
+            st.session_state.first = True
+            st.session_state.data = remove_duplicate_trips(st.session_state.data)
     else:
         # Fetch data if local data is invalid
         if st.session_state.hopsworks_project is None: