"""Streamlit app for exploring predicted bus occupancy.

The app loads a predictions table (from a local CSV cache when it is less
than a day old, otherwise from a Hopsworks feature group), lets the user pick
a bus route, direction, stop, day and time window in the sidebar, and then
renders one occupancy chart and one map per matching bus.
"""

import os
import time
from datetime import datetime, timedelta

import altair as alt
import hopsworks
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import streamlit as st

import api

# Constants
DATA_DIR = "data"
TIMESTAMP_FILE = "last_download_time.txt"
DATA_FILENAME = "data.csv"
HOPSWORKS_PROJECT = "id2223AirQuality"
FEATURE_GROUP_NAME = "predictions"
FEATURE_GROUP_VERSION = 1

# Occupancy status code -> human readable label.
OCCUPANCY_CATEGORIES = {
    0: 'Empty',
    1: 'Many seats available',
    2: 'Few seats available',
    3: 'Standing room only',
    4: 'Crushed standing room',
    5: 'Full',
}

# Occupancy status code -> folium marker colour.
OCCUPANCY_COLORS = {
    0: "green",
    1: "blue",
    2: "purple",
    3: "pink",
    4: "orange",
    5: "red",
}

# Legend shown next to the map (rendered with unsafe_allow_html=True).
# NOTE(review): only plain text is visible here; if the legend is supposed to
# carry HTML colour swatches, that markup is missing - confirm against the
# deployed version.
LEGEND_HTML = '''
Occupancy Legend:
Empty
Many seats available
Few seats available
Standing room only
Crushed standing room
Full
Your stop
'''


# Initialize Hopsworks connection
def connect_to_hopsworks():
    """Log in to Hopsworks and return the project handle.

    The API key is read from the ``HOPSWORKS_API_KEY`` environment variable
    and handed to ``hopsworks.login``. The key is deliberately never printed
    or logged.
    """
    st.write("Connecting to Hopsworks...")
    api_key = os.getenv("HOPSWORKS_API_KEY")
    return hopsworks.login(project=HOPSWORKS_PROJECT, api_key_value=api_key)


# Fetch data from Hopsworks feature group
def fetch_data_from_feature_group(project, feature_group_name, version):
    """Read a whole feature group from the project's feature store.

    Args:
        project: Project handle as returned by ``connect_to_hopsworks``.
        feature_group_name: Name of the feature group to read.
        version: Version of the feature group to read.

    Returns:
        Whatever ``feature_group.read()`` returns (used as a DataFrame by
        the rest of this file).
    """
    feature_store = project.get_feature_store()
    feature_group = feature_store.get_feature_group(
        name=feature_group_name, version=version
    )
    return feature_group.read()


# Save data locally
def save_data_locally(data, filename):
    """Write ``data`` as CSV into ``DATA_DIR`` and record the download time.

    Args:
        data: DataFrame to persist.
        filename: File name (inside ``DATA_DIR``) for the CSV.

    Returns:
        The path of the written CSV file.
    """
    os.makedirs(DATA_DIR, exist_ok=True)
    filepath = os.path.join(DATA_DIR, filename)
    data.to_csv(filepath, index=False)

    # Save timestamp; is_local_data_valid() parses it with fromisoformat().
    timestamp_path = os.path.join(DATA_DIR, TIMESTAMP_FILE)
    with open(timestamp_path, "w", encoding="utf-8") as f:
        f.write(datetime.now().isoformat())
    return filepath


# Load local data
def load_local_data(filename):
    """Return the cached CSV as a DataFrame, or ``None`` if it is missing."""
    filepath = os.path.join(DATA_DIR, filename)
    if not os.path.exists(filepath):
        return None
    return pd.read_csv(filepath)


# Check if local data is valid
def is_local_data_valid():
    """Return ``True`` when the cache timestamp exists and is at most a day old.

    A missing, unreadable or unparsable timestamp counts as invalid; read and
    parse problems are additionally surfaced as a Streamlit warning.
    """
    timestamp_path = os.path.join(DATA_DIR, TIMESTAMP_FILE)
    if not os.path.exists(timestamp_path):
        return False
    try:
        with open(timestamp_path, "r", encoding="utf-8") as f:
            last_download_time = datetime.fromisoformat(f.read().strip())
        # TypeError covers a tz-aware timestamp written by some other tool.
        age = datetime.now() - last_download_time
    except (OSError, ValueError, TypeError) as e:
        st.warning(f"Error reading timestamp: {e}")
        return False
    # The data is stale once it is more than a day old.
    return age <= timedelta(days=1)


def get_buses():
    """Derive bus lookup tables from ``st.session_state.data``.

    Returns:
        A tuple ``(bus_df, bus_list, short_bus_list)``:
        ``bus_df`` holds the unique (trip_id, route_long_name,
        route_short_name) rows, ``bus_list`` the unique (route_long_name,
        route_short_name) rows, and ``short_bus_list`` the unique short
        names as a plain list.
    """
    data = st.session_state.data
    bus_df = data[["trip_id", "route_long_name", "route_short_name"]].drop_duplicates()
    bus_list = bus_df[["route_long_name", "route_short_name"]].drop_duplicates()
    short_bus_list = list(pd.unique(bus_df["route_short_name"]))
    return bus_df, bus_list, short_bus_list


# Function to remove duplicates
def remove_near_duplicates(data):
    """Drop rows that closely follow an already kept row at the same stop.

    Rows are grouped by (route_id, stop_name) and visited in chronological
    order. A row is kept when it is the first of its group or more than three
    minutes after the previously *kept* row of that group.

    The input frame is not modified. The result keeps the input's columns and
    is ordered by group and then by time; an empty input yields an empty
    frame with the same columns.

    Args:
        data: DataFrame with ``route_id``, ``stop_name`` and ``datetime``
            columns.

    Returns:
        The filtered DataFrame (with ``datetime`` parsed to datetimes).
    """
    data = data.copy()
    data["datetime"] = pd.to_datetime(data["datetime"])
    window = pd.Timedelta(minutes=3)

    # Work on positions so that a non-unique index cannot duplicate rows.
    times = data["datetime"].reset_index(drop=True)
    group_keys = [data["route_id"].to_numpy(), data["stop_name"].to_numpy()]

    kept_positions = []
    for _, group_times in times.groupby(group_keys):
        last_kept = None
        # The comparison is against the last kept row, so it has to be
        # evaluated sequentially rather than with a vectorised diff().
        for position, when in group_times.sort_values(kind="stable").items():
            if last_kept is None or when - last_kept > window:
                kept_positions.append(position)
                last_kept = when
    return data.iloc[kept_positions]


def remove_duplicate_trips(df, route_id_col="route_id", trip_id_col="trip_id", datetime_col="datetime", time_window='3min'):
    """
    Removes near-duplicate rows based on route_id and time proximity.

    Within each route the rows are ordered by time; a row is kept when it is
    the first row of its route or when it follows the previous row of the
    same route by more than ``time_window``. The input frame is not modified.

    NOTE(review): the comparison is done per row, not per trip, and
    ``trip_id_col`` is not used by the filter - confirm that this is the
    intended granularity.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing trip data.
        route_id_col (str): Column name for route IDs.
        trip_id_col (str): Column name for trip IDs (kept for interface
            compatibility; currently unused).
        datetime_col (str): Column name for departure times.
        time_window (str): Time window for considering rows as duplicates
            (e.g., '3min').

    Returns:
        pd.DataFrame: Filtered DataFrame with duplicates removed, sorted by
        route and time.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df[datetime_col] = pd.to_datetime(df[datetime_col])

    # Sort by route and time for correct chronological order within a route.
    df = df.sort_values(by=[route_id_col, datetime_col])

    # Gap to the previous row of the same route; NaT for each route's first
    # row, which must always be kept.
    gap = df.groupby(route_id_col)[datetime_col].diff()
    keep = gap.isna() | (gap > pd.to_timedelta(time_window))
    return df[keep]


def _prepare_plot_frame(plot_df):
    """Select the plotted columns and order the rows chronologically."""
    columns = ["datetime", "vehicle_occupancystatus", "stop_name", "route_id"]
    return plot_df[columns].sort_values("datetime")


def _occupancy_chart(plot_df, title, stop_order=None):
    """Build the step-line Altair chart of occupancy per stop.

    Args:
        plot_df: Frame produced by ``_prepare_plot_frame``.
        title: Chart title.
        stop_order: Optional explicit order of the stops on the x axis; when
            omitted Altair's default ordering is used.
    """
    # assign() returns a new frame, avoiding writes to a slice of the input.
    chart_df = plot_df.assign(
        Occupancy=plot_df["vehicle_occupancystatus"].map(OCCUPANCY_CATEGORIES)
    )

    # Explicit order for the Y categories, from 'Full' down to 'Empty'.
    category_order = list(OCCUPANCY_CATEGORIES.values())[::-1]

    x_options = {"title": "Stop name"}
    if stop_order is not None:
        x_options["sort"] = stop_order

    return alt.Chart(chart_df).mark_line(point=True, interpolate="step-after").encode(
        x=alt.X('stop_name:N', **x_options),
        # Treat Y as categorical.
        y=alt.Y(
            'Occupancy:N',
            title="Vehicle Occupancy Status (Categories)",
            sort=category_order,
            scale=alt.Scale(domain=category_order),
        ),
        # Tooltips for interactivity.
        tooltip=["datetime", 'stop_name', 'Occupancy'],
    ).properties(title=title)


def plot_graph(plot_df):
    """Show the head/tail of the plotted data followed by the occupancy chart.

    TODO: should the chart show only the stops the user asked for, or all?

    Args:
        plot_df: DataFrame with ``datetime``, ``vehicle_occupancystatus``,
            ``stop_name`` and ``route_id`` columns.
    """
    plot_df = _prepare_plot_frame(plot_df)
    st.write(plot_df.head())
    st.write(plot_df.tail())
    chart = _occupancy_chart(plot_df, "Vehicle Occupancy Status Over Time")
    st.altair_chart(chart, use_container_width=True)


def plot_graph_title(plot_df, stop, time):
    """Show the occupancy chart for one bus, titled with its stop and time.

    Stops appear on the x axis in the order in which they first occur in the
    chronologically sorted data.

    TODO: should the chart show only the stops the user asked for, or all?

    Args:
        plot_df: DataFrame with ``datetime``, ``vehicle_occupancystatus``,
            ``stop_name`` and ``route_id`` columns.
        stop: Stop name used in the chart title.
        time: Arrival time text used in the chart title.
    """
    plot_df = _prepare_plot_frame(plot_df)
    # Altair expects a plain list of distinct values as an explicit sort order.
    stop_order = plot_df["stop_name"].drop_duplicates().tolist()
    chart = _occupancy_chart(
        plot_df,
        f"Vehicle Occupancy For Bus arriving at {stop} at {time}",
        stop_order=stop_order,
    )
    st.altair_chart(chart, use_container_width=True)


def visualize(filtered_data, stop_name, key=None):
    """Render a folium map of the stops next to the occupancy legend.

    Every row becomes one marker. The marker of the user's own stop is black;
    all other markers are coloured by occupancy status, falling back to the
    default folium icon colour for statuses without a configured colour.

    Args:
        filtered_data: DataFrame with ``stop_lat``, ``stop_lon``,
            ``stop_name`` and ``vehicle_occupancystatus`` columns.
        stop_name: Name of the stop to highlight as "your stop".
        key: Optional component key forwarded to ``st_folium``; pass a unique
            value when several maps are rendered in one script run.
    """
    import folium
    from streamlit_folium import st_folium

    if filtered_data.empty:
        # Without rows there is neither a map centre nor bounds to compute.
        st.warning("No stop data available to show on the map.")
        return

    # Create a folium map centred on the first stop, zoomed to fit them all.
    first_stop = filtered_data.iloc[0]
    m = folium.Map(location=[first_stop["stop_lat"], first_stop["stop_lon"]], zoom_start=12)
    sw = filtered_data[['stop_lat', 'stop_lon']].min().values.tolist()
    ne = filtered_data[['stop_lat', 'stop_lon']].max().values.tolist()
    m.fit_bounds([sw, ne])

    # Add bus stop markers based on filtered data.
    for _, row in filtered_data.iterrows():
        status = row['vehicle_occupancystatus']
        # Unknown or missing status codes must not abort the whole map.
        occupancy = OCCUPANCY_CATEGORIES.get(status, "Unknown")

        if row["stop_name"] == stop_name:
            icon = folium.Icon(color="black", icon="bus-simple", prefix="fa")
        elif status in OCCUPANCY_COLORS:
            icon = folium.Icon(color=OCCUPANCY_COLORS[status], icon="bus-simple", prefix="fa")
        else:
            icon = folium.Icon(icon="bus-simple", prefix="fa")

        folium.Marker(
            [row['stop_lat'], row['stop_lon']],
            popup=f"Bus stop: {row['stop_name']} Bus occupancy: {occupancy}",
            icon=icon,
        ).add_to(m)

    # Layout: split the screen into two columns (3:1).
    col1, col2 = st.columns([3, 1])

    # Display the folium map in the first column.
    with col1:
        st_folium(m, width=700, height=500, key=key)

    # Display the legend in the second column.
    with col2:
        st.markdown(LEGEND_HTML, unsafe_allow_html=True)


def drop_the_duplicates(df):
    """Drop rows with a repeated or closely following ``datetime``.

    Exact ``datetime`` duplicates are removed first. Of the remaining rows,
    one is kept when it has no predecessor or when it is more than three
    minutes after the row directly before it. The caller is expected to pass
    rows already sorted by ``datetime``.

    Args:
        df: DataFrame with a datetime-typed ``datetime`` column.

    Returns:
        The filtered DataFrame with the same columns as the input.
    """
    df = df.drop_duplicates("datetime")
    # Kept as a local Series so no helper column leaks into the result.
    previous = df["datetime"].shift(1)
    keep = previous.isna() | ((df["datetime"] - previous) > timedelta(minutes=3))
    return df[keep]


def _load_data():
    """Return the predictions frame, preferring a fresh local cache.

    Falls back to downloading from Hopsworks (and refreshing the cache) when
    the cache is stale or when its CSV file is missing.
    """
    if is_local_data_valid():
        cached = load_local_data(DATA_FILENAME)
        if cached is not None:
            st.write("Using cached local data.")
            if "first" not in st.session_state:
                st.session_state.first = True
            return cached
        # A fresh timestamp without its CSV would otherwise leave data=None.
        st.warning("Cached data file is missing; downloading again.")

    if st.session_state.hopsworks_project is None:
        st.write("Initializing Hopsworks connection...")
        st.session_state.hopsworks_project = connect_to_hopsworks()
        st.success("Connected to Hopsworks!")

    data = fetch_data_from_feature_group(
        st.session_state.hopsworks_project, FEATURE_GROUP_NAME, FEATURE_GROUP_VERSION
    )
    filepath = save_data_locally(data, DATA_FILENAME)
    st.success(f"Data fetched and saved locally at {filepath}")
    return data


# Streamlit UI
def main():
    """Run the app: load data, collect sidebar choices, render the results."""
    st.title("Wheely Fun Times - Bus Occupancy Explorer")

    # Initialize session state
    if "hopsworks_project" not in st.session_state:
        st.session_state.hopsworks_project = None
    if "data" not in st.session_state:
        st.session_state.data = None

    st.session_state.data = _load_data()
    data = st.session_state.data

    _, bus_list, short_bus = get_buses()
    short_bus = sorted(short_bus)

    # Sidebar section for searching buses
    st.sidebar.title("Search for your desired bus")
    search = st.sidebar.selectbox(
        "Search for your bus number:",
        options=short_bus,
        help="Select one bus to view details."
    )

    if not search:
        st.write("No buses selected. Please search in the sidebar.")
        return

    # A short name can map to several routes; let the user pick when it does.
    route = bus_list[bus_list["route_short_name"] == search]
    long_names = list(pd.unique(route["route_long_name"]))
    if len(long_names) == 1:
        bus = long_names[0]
    else:
        bus = st.sidebar.selectbox(
            "Pick bus route:",
            options=long_names,
            help="Select one bus to view details."
        )
    st.write(f"### Selected Bus: {search} {bus}")

    # The direction is a flag in session state, flipped by a sidebar button.
    if "direction" not in st.session_state:
        st.session_state.direction = False
    if st.sidebar.button('Change Direction'):
        st.session_state.direction = not st.session_state.direction

    # Copy before adding columns so the shared session data stays untouched.
    bus_trips = data[data["route_long_name"] == bus].copy()
    # utc=True accepts both tz-aware and tz-naive input; the result is
    # naive UTC so it can be compared with naive datetimes below.
    # NOTE(review): the time window below is built from the user's input
    # without any timezone conversion - confirm that it should be read as UTC.
    bus_trips["datetime"] = pd.to_datetime(bus_trips["datetime"], utc=True).dt.tz_convert(None)

    stops = list(pd.unique(bus_trips["stop_name"]))
    stop_choice = st.sidebar.selectbox(
        "Select your bus stop:",
        options=stops,
        help="Select one bus stop to se occupancy."
    )

    today = datetime.now().date()
    tomorrow = today + timedelta(days=1)
    date_options = {
        today.strftime("%d %B %Y"): today,
        tomorrow.strftime("%d %B %Y"): tomorrow,
    }
    day_choice = st.sidebar.radio("Select the day:", options=list(date_options.keys()))

    # Time input widgets in the sidebar; both stay None until the user picks.
    start_time = st.sidebar.time_input("Select a start time", value=None)
    end_time = st.sidebar.time_input("Select an end time", value=None)
    if start_time is None or end_time is None:
        return

    st.write(f"Displaying buses between {start_time.strftime('%H:%M')} and {end_time.strftime('%H:%M')} the {day_choice}")

    day = date_options[day_choice]
    window_start = datetime.combine(day, start_time)
    window_end = datetime.combine(day, end_time)
    selected_trips = bus_trips[
        (bus_trips["datetime"] >= window_start)
        & (bus_trips["datetime"] <= window_end)
        & (bus_trips["direction_id"] == st.session_state.direction)
        & (bus_trips["stop_name"] == stop_choice)
    ]

    # TODO: stop_name could be dropped here; the stop choice might also be
    # kept in session state.
    arrivals = selected_trips[["trip_id", "stop_name", "datetime"]].sort_values(by=["datetime"])
    arrivals = drop_the_duplicates(arrivals)
    st.write(f"{arrivals['trip_id'].nunique()} buses available")

    for position, (_, row) in enumerate(arrivals.iterrows()):
        trip_rows = data[data["trip_id"] == row["trip_id"]]
        plot_graph_title(trip_rows, row["stop_name"], row['datetime'].strftime('%H:%M'))
        # The position makes the key unique even if a trip id occurs twice.
        visualize(trip_rows, stop_choice, key=f"map_{position}_{row['trip_id']}")


if __name__ == "__main__":
    main()

# Open questions / ideas:
# Show all bus lines? Search?
# How to see the direction?
# Filter on bus line and direction
# Filter on time
# Should the user enter a time
# See all unique trip ids
# Map position to stop
# Show some kind of graph for all buses within that time
# Should it be for all stops or only those the user said they will travel