WheelyFunTimes / test.py
elli-teu
Test av borttagning av dubbletter, tillägg av hållplatser samt start på sortering
8769306
raw
history blame
1.28 kB
import os
import pandas as pd
# Load local data
def load_local_data(filepath: str = "WheelyFunTimes/test_data.csv") -> pd.DataFrame:
    """Read the local test data set from a CSV file.

    Args:
        filepath: Path of the CSV file to read. The default is a relative
            path, so it is resolved against the current working directory.

    Returns:
        The parsed contents of the file as a DataFrame.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist (raised by
            ``pandas.read_csv``).
    """
    return pd.read_csv(filepath)
def remove_near_duplicates(
    data: pd.DataFrame, min_gap_minutes: float = 3
) -> pd.DataFrame:
    """Drop rows that repeat a (route_id, stop_name) pair within a short time.

    Rows are grouped by ``route_id`` and ``stop_name`` and walked in
    chronological order. The first row of each group is kept; every later
    row is kept only if its ``datetime`` is more than ``min_gap_minutes``
    after the most recently *kept* row of that group (not merely the
    previous row), so a long chain of closely spaced rows is thinned out
    rather than removed entirely.

    The number of distinct ``trip_id`` values is printed before and after
    filtering.

    Args:
        data: Frame with at least the columns ``trip_id``, ``route_id``,
            ``stop_name`` and ``datetime``. It is not modified.
        min_gap_minutes: Minimum spacing, in minutes, that a row needs
            from the last kept row of its group in order to be kept.

    Returns:
        A new DataFrame holding the kept rows, ordered by group and then by
        time, with ``datetime`` converted to pandas timestamps and the
        original index labels and column dtypes retained. If no row is
        kept, the result is empty but still has all columns.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    print(data["trip_id"].nunique())

    # assign() builds a new frame, so the caller's "datetime" column is not
    # overwritten with the parsed values.
    parsed = data.assign(datetime=pd.to_datetime(data["datetime"]))
    min_gap = pd.Timedelta(minutes=min_gap_minutes)

    # The gap test below only makes sense in chronological order. A stable
    # sort keeps the input order for rows with identical timestamps; missing
    # timestamps (NaT) are placed last.
    ordered = parsed.sort_values("datetime", kind="stable")

    # Same rows as `ordered`, relabelled 0..n-1 so that each label is also
    # the row's position in `ordered` (input index labels may repeat).
    lookup = ordered[["route_id", "stop_name", "datetime"]].reset_index(drop=True)

    keep_positions = []
    for _, group in lookup.groupby(["route_id", "stop_name"]):
        last_kept = None
        for position, stamp in group["datetime"].items():
            # A comparison involving NaT is False, so such a row is only
            # ever kept when it is the first row of its group.
            if last_kept is None or stamp - last_kept > min_gap:
                keep_positions.append(position)
                last_kept = stamp

    # Positional selection keeps columns and dtypes even when nothing is kept.
    filtered_df = ordered.iloc[keep_positions]
    print(filtered_df["trip_id"].nunique())
    return filtered_df
# Demo run: load the test data, show the first 12 rows, remove near-duplicate
# rows and show the first 12 rows again. Guarded so that importing this module
# does not read the file or print anything.
if __name__ == "__main__":
    df = load_local_data()
    print(df.head(12))
    df = remove_near_duplicates(df)
    print(df.head(12))