elli-teu
committed on
Commit
·
5184bfe
1
Parent(s):
7762bc9
Removed duplicate trips
Browse files
app.py
CHANGED
@@ -74,6 +74,48 @@ def get_buses():
|
|
74 |
short_bus_list = list(pd.unique(bus_df["route_short_name"]))
|
75 |
return bus_df, bus_list, short_bus_list
|
76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
def plot_graph(plot_df):
|
78 |
#Nu vill vi plotta!
|
79 |
categories = {0 : 'Empty',
|
@@ -154,6 +196,9 @@ def main():
|
|
154 |
if is_local_data_valid():
|
155 |
st.write("Using cached local data.")
|
156 |
st.session_state.data = load_local_data("data.csv")
|
|
|
|
|
|
|
157 |
else:
|
158 |
# Fetch data if local data is invalid
|
159 |
if st.session_state.hopsworks_project is None:
|
|
|
74 |
short_bus_list = list(pd.unique(bus_df["route_short_name"]))
|
75 |
return bus_df, bus_list, short_bus_list
|
76 |
|
77 |
+
|
78 |
+
def remove_duplicate_trips(df, route_id_col="route_id", trip_id_col="trip_id", stop_id_col="stop_name", datetime_col="datetime", time_window="3min"):
    """
    Remove trips that duplicate an earlier trip on the same route.

    A trip's *start* is its chronologically earliest row. Two trips are
    treated as duplicates when they share the same route and the same start
    stop, and the later one starts no more than ``time_window`` after the
    previous trip start at that stop. Only the earliest trip of each such
    run is kept; because every start is compared with the one immediately
    before it, a chain of closely spaced starts collapses to its first trip.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing trip data. It is not
            modified; the function works on a copy.
        route_id_col (str): Column name for route IDs.
        trip_id_col (str): Column name for trip IDs.
        stop_id_col (str): Column name identifying the stop of each row.
        datetime_col (str): Column name for the timestamp of each row.
        time_window (str): Maximum gap between two trip starts for the later
            one to count as a duplicate (anything ``pd.Timedelta`` accepts,
            e.g. '3min').

    Returns:
        pd.DataFrame: All rows of the retained trips, with ``datetime_col``
        converted to datetime and rows sorted by route, stop and datetime.
    """
    window = pd.Timedelta(time_window)

    # Work on a copy so the caller's DataFrame keeps its original dtypes.
    data = df.copy()
    data[datetime_col] = pd.to_datetime(data[datetime_col])

    # The start of a trip is its earliest row in time (not the first stop
    # alphabetically). drop_duplicates keeps one whole row per trip, so the
    # stop and timestamp always come from the same observation.
    trip_starts = (
        data.sort_values(datetime_col, kind="stable")
        .drop_duplicates(subset=trip_id_col, keep="first")
        .sort_values([route_id_col, stop_id_col, datetime_col])
    )

    # With starts ordered by route, stop and time, a duplicate is a start that
    # has the same route and stop as the previous row and follows it closely.
    same_route = trip_starts[route_id_col].eq(trip_starts[route_id_col].shift())
    same_stop = trip_starts[stop_id_col].eq(trip_starts[stop_id_col].shift())
    close_in_time = trip_starts[datetime_col].diff() <= window
    is_duplicate = same_route & same_stop & close_in_time

    kept_trip_ids = trip_starts.loc[~is_duplicate, trip_id_col]

    # Keep every row of the surviving trips, in the same order as before.
    result = data[data[trip_id_col].isin(kept_trip_ids)]
    return result.sort_values([route_id_col, stop_id_col, datetime_col])
|
118 |
+
|
119 |
def plot_graph(plot_df):
|
120 |
#Nu vill vi plotta!
|
121 |
categories = {0 : 'Empty',
|
|
|
196 |
if is_local_data_valid():
|
197 |
st.write("Using cached local data.")
|
198 |
st.session_state.data = load_local_data("data.csv")
|
199 |
+
if "first" not in st.session_state:
|
200 |
+
st.session_state.first = True
|
201 |
+
st.session_state.data = remove_duplicate_trips(st.session_state.data)
|
202 |
else:
|
203 |
# Fetch data if local data is invalid
|
204 |
if st.session_state.hopsworks_project is None:
|