import os

import pandas as pd


# Load local data
def load_local_data():
    # filepath = os.path.join(current_dir, "test_data.csv")
    filepath = "WheelyFunTimes/test_data.csv"
    if os.path.exists(filepath):
        return pd.read_csv(filepath)
    else:
        return None


def remove_near_duplicates(data):
    # Sanity check: number of unique trips before filtering
    print(data["trip_id"].nunique())
    result = []
    data["datetime"] = pd.to_datetime(data["datetime"])
    for _, group in data.groupby(["route_id", "stop_name"]):
        # Ensure time order within each group so the 3-minute window
        # comparison below is valid even if the input is unsorted.
        group = group.sort_values("datetime")
        # Collect rows that are not near-duplicates of the last kept row
        filtered_rows = []
        last_row = None
        for _, row in group.iterrows():
            if last_row is None or (row["datetime"] - last_row["datetime"] > pd.Timedelta(minutes=3)):
                # Keep the row if it's the first or sufficiently far apart in time
                filtered_rows.append(row)
                last_row = row
        # Add filtered rows to the result
        result.extend(filtered_rows)
    filtered_df = pd.DataFrame(result)
    # Sanity check: number of unique trips after filtering
    print(filtered_df["trip_id"].nunique())
    # Return the filtered dataframe
    return filtered_df


df = load_local_data()
if df is not None:
    print(df.head(12))
    df = remove_near_duplicates(df)
    print(df.head(12))
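
# A minimal self-check on synthetic data (hypothetical values, not taken from
# test_data.csv): two arrivals at the same route/stop within 3 minutes should
# collapse to one row, while a later arrival is kept.
sample = pd.DataFrame({
    "trip_id": [1, 1, 2],
    "route_id": ["A", "A", "A"],
    "stop_name": ["Main St", "Main St", "Main St"],
    "datetime": [
        "2024-01-01 08:00:00",
        "2024-01-01 08:02:00",
        "2024-01-01 08:10:00",
    ],
})
print(remove_near_duplicates(sample))  # expect the 08:00 and 08:10 rows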