elli-teu committed on
Commit
5184bfe
·
1 Parent(s): 7762bc9

Removed duplicate trips

Browse files
Files changed (1) hide show
  1. app.py +45 -0
app.py CHANGED
@@ -74,6 +74,48 @@ def get_buses():
74
  short_bus_list = list(pd.unique(bus_df["route_short_name"]))
75
  return bus_df, bus_list, short_bus_list
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def plot_graph(plot_df):
78
  #Nu vill vi plotta!
79
  categories = {0 : 'Empty',
@@ -154,6 +196,9 @@ def main():
154
  if is_local_data_valid():
155
  st.write("Using cached local data.")
156
  st.session_state.data = load_local_data("data.csv")
 
 
 
157
  else:
158
  # Fetch data if local data is invalid
159
  if st.session_state.hopsworks_project is None:
 
74
  short_bus_list = list(pd.unique(bus_df["route_short_name"]))
75
  return bus_df, bus_list, short_bus_list
76
 
77
+
78
def remove_duplicate_trips(df, route_id_col="route_id", trip_id_col="trip_id", stop_id_col="stop_name", datetime_col="datetime", time_window='3min'):
    """
    Remove duplicate trips from a DataFrame of trip stop records.

    A trip is treated as a duplicate when another trip on the same route
    starts at the same stop and the gap between their start times is at
    most ``time_window``. The comparison is made against the immediately
    preceding trip start (same route, same start stop), so a chain of
    closely spaced trips collapses to its earliest member.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing trip data, one row per
        (trip, stop) observation. It is not modified.
    route_id_col (str): Column name for route IDs.
    trip_id_col (str): Column name for trip IDs.
    stop_id_col (str): Column name for stop IDs.
    datetime_col (str): Column name for departure times.
    time_window (str): Time window for considering trips as duplicates
        (e.g., '3min'); anything accepted by ``pd.Timedelta``.

    Returns:
    pd.DataFrame: Rows of ``df`` belonging to the retained trips, sorted by
        route, stop and datetime, with ``datetime_col`` converted to
        datetime. Rows whose trip ID is missing are dropped.
    """
    # Work on a copy so the caller's DataFrame is not mutated by the
    # datetime conversion below.
    df = df.copy()

    # Ensure the datetime column is of datetime type
    df[datetime_col] = pd.to_datetime(df[datetime_col])

    # Keep the output ordering stable for callers: route, stop, datetime.
    df = df.sort_values(by=[route_id_col, stop_id_col, datetime_col])

    # The start of a trip is its chronologically earliest row (not the row
    # with the alphabetically first stop name). A stable sort followed by
    # drop_duplicates picks exactly one such row per trip.
    trip_starts = (
        df.dropna(subset=[trip_id_col])
        .sort_values(by=datetime_col, kind="mergesort")
        .drop_duplicates(subset=trip_id_col, keep="first")
        .sort_values(by=[route_id_col, stop_id_col, datetime_col])
        .reset_index(drop=True)
    )

    # Within each (route, start stop) group the starts are now in time
    # order, so shifting by one gives the previous trip's start time.
    previous_start = trip_starts.groupby([route_id_col, stop_id_col])[datetime_col].shift()
    gap = trip_starts[datetime_col] - previous_start

    # The first trip of every group has no predecessor (gap is NaT), which
    # compares False here and is therefore always kept.
    is_duplicate = gap <= pd.Timedelta(time_window)

    # Filter the original DataFrame to retain only the non-duplicate trips
    unique_trip_ids = trip_starts.loc[~is_duplicate, trip_id_col].unique()
    return df[df[trip_id_col].isin(unique_trip_ids)]
118
+
119
  def plot_graph(plot_df):
120
  #Nu vill vi plotta!
121
  categories = {0 : 'Empty',
 
196
  if is_local_data_valid():
197
  st.write("Using cached local data.")
198
  st.session_state.data = load_local_data("data.csv")
199
+ if "first" not in st.session_state:
200
+ st.session_state.first = True
201
+ st.session_state.data = remove_duplicate_trips(st.session_state.data)
202
  else:
203
  # Fetch data if local data is invalid
204
  if st.session_state.hopsworks_project is None: