File size: 16,409 Bytes
064a25d
e4fbfab
 
 
 
 
 
 
2bd6eac
064a25d
effa819
 
e4fbfab
 
 
 
 
 
 
 
6f35e8c
8aa409a
7030e08
e4fbfab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
effa819
 
 
 
 
 
 
 
e4fbfab
8769306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5184bfe
8769306
5184bfe
 
 
 
 
 
 
 
 
 
 
8769306
 
5184bfe
 
8769306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5184bfe
 
39a6a86
 
8769306
7762bc9
 
 
 
 
 
39a6a86
8769306
7762bc9
39a6a86
 
 
7762bc9
 
 
 
 
39a6a86
 
7762bc9
 
 
 
39a6a86
 
 
 
e4fbfab
8769306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc58789
7762bc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc58789
 
 
 
e97cfa6
cc58789
 
 
 
 
 
 
7762bc9
 
 
 
e4fbfab
 
7762bc9
e4fbfab
8dd2873
e4fbfab
 
 
8dd2873
e4fbfab
 
618cd91
 
 
 
 
e4fbfab
8dd2873
e4fbfab
618cd91
5184bfe
 
8769306
e4fbfab
 
 
 
 
 
 
 
7edb958
effa819
618cd91
e4fbfab
 
 
effa819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39a6a86
7762bc9
 
 
 
 
 
 
 
 
 
8769306
 
 
 
 
 
 
 
 
 
 
 
7762bc9
 
39a6a86
 
 
 
 
 
 
 
 
effa819
39a6a86
effa819
39a6a86
 
 
effa819
8769306
39a6a86
 
8769306
 
 
 
 
 
39a6a86
7762bc9
39a6a86
 
 
 
 
 
 
8769306
 
 
7762bc9
 
8769306
 
39a6a86
8769306
 
 
cc58789
8769306
 
 
 
 
 
 
 
 
 
39a6a86
 
 
cc58789
2bd6eac
effa819
 
 
 
 
 
 
e4fbfab
 
39a6a86
 
e4fbfab
 
d5c7130
 
 
 
 
 
 
 
 
8dd2873
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
import streamlit as st
import hopsworks
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import altair as alt

import api

# Constants
# Directory (relative to the working directory) where the cached CSV and the
# download-timestamp file are written.
DATA_DIR = "data"
# Name of the file inside DATA_DIR that stores when data was last saved.
TIMESTAMP_FILE = "last_download_time.txt"

# Initialize Hopsworks connection
def connect_to_hopsworks():
    """Log in to the Hopsworks project and return the project handle.

    The API key is read from the ``HOPSWORKS_API_KEY`` environment variable.
    It is deliberately never printed or logged, since it is a secret.

    Returns:
        The object returned by ``hopsworks.login`` for the
        "id2223AirQuality" project.
    """
    st.write("Connecting to Hopsworks...")
    project_name = "id2223AirQuality"
    api_key = os.getenv("HOPSWORKS_API_KEY")
    return hopsworks.login(project=project_name, api_key_value=api_key)

# Fetch data from Hopsworks feature group
def fetch_data_from_feature_group(project, feature_group_name, version):
    """Read the full contents of one feature group.

    Parameters:
        project: Project handle exposing ``get_feature_store()``.
        feature_group_name: Name of the feature group to look up.
        version: Version of the feature group to look up.

    Returns:
        Whatever the feature group's ``read()`` call returns.
    """
    return (
        project.get_feature_store()
        .get_feature_group(name=feature_group_name, version=version)
        .read()
    )

# Save data locally
def save_data_locally(data, filename):
    """Write ``data`` as CSV under DATA_DIR and record the save time.

    Parameters:
        data: Object with a ``to_csv`` method (a DataFrame in this app).
        filename: File name to create inside DATA_DIR.

    Returns:
        The path of the CSV file that was written.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    csv_path = os.path.join(DATA_DIR, filename)
    data.to_csv(csv_path, index=False)

    # Record when the data was saved so freshness can be checked later.
    saved_at = str(datetime.now())
    with open(os.path.join(DATA_DIR, TIMESTAMP_FILE), "w") as marker:
        marker.write(saved_at)

    return csv_path

# Load local data
def load_local_data(filename):
    """Load a CSV from DATA_DIR, or return None when the file is absent.

    Parameters:
        filename: File name to look for inside DATA_DIR.

    Returns:
        The DataFrame produced by ``pd.read_csv``, or ``None`` if no such
        file exists.
    """
    candidate = os.path.join(DATA_DIR, filename)
    if not os.path.exists(candidate):
        return None
    return pd.read_csv(candidate)

# Check if local data is valid
def is_local_data_valid():
    """Report whether the locally saved data is at most one day old.

    Returns:
        ``True`` when the timestamp file exists, parses as an ISO timestamp
        and is no more than one day in the past; ``False`` otherwise. Any
        error while reading or parsing is shown as a Streamlit warning and
        treated as "not valid".
    """
    marker_path = os.path.join(DATA_DIR, TIMESTAMP_FILE)
    if not os.path.exists(marker_path):
        return False

    try:
        with open(marker_path, "r") as marker:
            raw_stamp = marker.read().strip()
        downloaded_at = datetime.fromisoformat(raw_stamp)
        age = datetime.now() - downloaded_at
        # Data older than one day is considered stale.
        return age <= timedelta(days=1)
    except Exception as e:
        st.warning(f"Error reading timestamp: {e}")
        return False
    
def get_buses():
    """Summarise the trips and routes present in ``st.session_state.data``.

    Returns:
        A 3-tuple of:
        - the distinct (trip_id, route_long_name, route_short_name) rows,
        - the distinct (route_long_name, route_short_name) rows,
        - a list of the unique route short names.
    """
    trip_columns = ["trip_id", "route_long_name", "route_short_name"]
    trips = st.session_state.data[trip_columns].drop_duplicates()

    routes = trips[["route_long_name", "route_short_name"]].drop_duplicates()
    short_names = list(pd.unique(trips["route_short_name"]))

    return trips, routes, short_names

# Function to remove duplicates
def remove_near_duplicates(data):
    """Drop rows that repeat a (route_id, stop_name) within three minutes.

    Within each (route_id, stop_name) group the rows are walked in
    chronological order. A row is kept when it is the first of its group or
    lies more than three minutes after the previously *kept* row.

    The input frame is not modified; the work is done on a copy whose
    ``datetime`` column has been parsed with ``pd.to_datetime``.

    Parameters:
        data (pd.DataFrame): Frame with at least ``route_id``, ``stop_name``
            and ``datetime`` columns.

    Returns:
        pd.DataFrame: The kept rows, group by group, with their original
        index labels and column dtypes. An empty frame (same columns) is
        returned when nothing is kept.
    """
    frame = data.copy()
    frame["datetime"] = pd.to_datetime(frame["datetime"])
    if frame.empty:
        return frame

    min_gap = pd.Timedelta(minutes=3)

    # Work on positions instead of index labels so that a non-unique index
    # on the input cannot duplicate rows in the result.
    times = frame["datetime"].reset_index(drop=True)
    group_keys = [frame["route_id"].to_numpy(), frame["stop_name"].to_numpy()]

    kept_positions = []
    for _, group_times in times.groupby(group_keys):
        last_kept = None
        # The gap rule only makes sense in chronological order.
        for position, stamp in group_times.sort_values(kind="stable").items():
            if last_kept is None or stamp - last_kept > min_gap:
                kept_positions.append(position)
                last_kept = stamp

    return frame.iloc[kept_positions]

def remove_duplicate_trips(df, route_id_col="route_id", trip_id_col="trip_id", datetime_col="datetime", time_window='3min'):
    """
    Removes rows that follow the previous row of the same route too closely.

    Rows are sorted by route and time. Within each route, a row is treated
    as a duplicate when it lies at most ``time_window`` after the row
    directly before it; the first row of every route is always kept.

    The input frame is not modified.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing trip data.
        route_id_col (str): Column name for route IDs.
        trip_id_col (str): Column name for trip IDs. Currently unused; kept
            so existing callers keep working.
        datetime_col (str): Column name for departure times.
        time_window (str): Time window for considering rows as duplicates
            (e.g., '3min').

    Returns:
        pd.DataFrame: Rows sorted by route and time with duplicates removed.
        The datetime column is converted to datetime dtype.
    """
    frame = df.copy()

    # Ensure the datetime column is of datetime type
    frame[datetime_col] = pd.to_datetime(frame[datetime_col])

    # Chronological order within each route is required for the gap check.
    frame = frame.sort_values(by=[route_id_col, datetime_col])

    # Gap to the previous row of the same route. The first row of a route
    # has no predecessor (NaT), which compares False below and is thus kept.
    gaps = frame.groupby(route_id_col)[datetime_col].diff()
    is_duplicate = gaps <= pd.to_timedelta(time_window)

    return frame[~is_duplicate]

# Labels for the numeric ``vehicle_occupancystatus`` codes, listed in code order.
_OCCUPANCY_LABELS = {
    0: 'Empty',
    1: 'Many seats available',
    2: 'Few seats available',
    3: 'Standing room only',
    4: 'Crushed standing room',
    5: 'Full',
}


def _render_occupancy_chart(plot_df, title, show_preview=False):
    """Draw a step-line chart of occupancy per stop for the given rows.

    Parameters:
        plot_df: Frame with ``datetime``, ``vehicle_occupancystatus``,
            ``stop_name`` and ``route_id`` columns.
        title: Chart title.
        show_preview: When true, the first and last rows of the sorted data
            are written to the page before the chart.
    """
    columns = ["datetime", "vehicle_occupancystatus", "stop_name", "route_id"]
    ordered = plot_df[columns].sort_values("datetime")

    if show_preview:
        st.write(ordered.head())
        st.write(ordered.tail())

    ordered = ordered.assign(
        Occupancy=ordered["vehicle_occupancystatus"].map(_OCCUPANCY_LABELS)
    )

    # Explicit axis order: the label of the highest code comes first.
    category_order = list(_OCCUPANCY_LABELS.values())[::-1]

    # NOTE(review): 'stop_name:N' has no explicit sort, so the x axis may not
    # follow the datetime order of the rows - confirm this is intended.
    chart = alt.Chart(ordered).mark_line(point=True, interpolate="step-after").encode(
        x=alt.X('stop_name:N', title="Stop name"),
        # Occupancy is treated as a categorical value.
        y=alt.Y(
            'Occupancy:N',
            title="Vehicle Occupancy Status (Categories)",
            sort=category_order,
            scale=alt.Scale(domain=category_order),
        ),
        tooltip=["datetime", 'stop_name', 'Occupancy'],  # hover details
    ).properties(
        title=title
    )
    st.altair_chart(chart, use_container_width=True)


def plot_graph(plot_df):
    """Plot occupancy per stop, preceded by a preview of the sorted rows.

    Parameters:
        plot_df: Frame with ``datetime``, ``vehicle_occupancystatus``,
            ``stop_name`` and ``route_id`` columns.
    """
    # TODO: should this show only the stops the user wants, or all of them?
    _render_occupancy_chart(
        plot_df,
        "Vehicle Occupancy Status Over Time",
        show_preview=True,
    )


def plot_graph_title(plot_df, stop, time):
    """Plot occupancy per stop with a title naming the stop and arrival time.

    Parameters:
        plot_df: Frame with ``datetime``, ``vehicle_occupancystatus``,
            ``stop_name`` and ``route_id`` columns.
        stop: Stop name inserted into the chart title.
        time: Arrival time text inserted into the chart title.
    """
    # TODO: should this show only the stops the user wants, or all of them?
    _render_occupancy_chart(
        plot_df,
        f"Vehicle Occupancy For Bus arriving at {stop} at {time}",
    )


def visualize(filtered_data, stop_name):
    """Show the stops of ``filtered_data`` as markers on a folium map.

    The map is initially centred on the first row and fitted to the bounding
    box of all stop coordinates. The marker of the stop named ``stop_name``
    is drawn in a different colour. Each popup shows the stop name and the
    occupancy label; codes without a known label are shown as "Unknown".

    Parameters:
        filtered_data: Frame with ``stop_lat``, ``stop_lon``, ``stop_name``
            and ``vehicle_occupancystatus`` columns.
        stop_name: Name of the stop to highlight.
    """
    import folium
    from streamlit_folium import st_folium

    # Without rows there is nothing to centre the map on.
    if filtered_data.empty:
        st.info("No stops to show on the map.")
        return

    categories = {
        0: 'Empty',
        1: 'Many seats available',
        2: 'Few seats available',
        3: 'Standing room only',
        4: 'Crushed standing room',
        5: 'Full',
    }

    # Create a folium map centered around the first stop
    first_stop = filtered_data.iloc[0]
    m = folium.Map(location=[first_stop["stop_lat"], first_stop["stop_lon"]], zoom_start=12)

    coordinates = filtered_data[['stop_lat', 'stop_lon']]
    sw = coordinates.min().values.tolist()
    ne = coordinates.max().values.tolist()
    m.fit_bounds([sw, ne])

    # Add one marker per row; only the chosen stop gets the special colour.
    for _, row in filtered_data.iterrows():
        # A missing or unexpected code must not break the whole map.
        occupancy = categories.get(row['vehicle_occupancystatus'], 'Unknown')

        icon_options = {"icon": "bus-simple", "prefix": "fa"}
        if row["stop_name"] == stop_name:
            icon_options["color"] = "darkpurple"

        folium.Marker(
            [row['stop_lat'], row['stop_lon']],
            popup=f"Bus stop: {row['stop_name']} Bus occupancy: {occupancy}",
            icon=folium.Icon(**icon_options),
        ).add_to(m)

    # Display the map
    st_folium(m, width=700, height=500)

# Streamlit UI
def _ensure_data_loaded():
    """Fill ``st.session_state.data`` from the local cache or from Hopsworks.

    The cached CSV is used when ``is_local_data_valid()`` reports a fresh
    timestamp and the file can actually be loaded. Otherwise the
    "predictions" feature group (version 1) is fetched and saved locally.
    """
    if is_local_data_valid():
        cached = load_local_data("data.csv")
        # The timestamp can be fresh while the CSV itself is missing; in
        # that case fall through and download again.
        if cached is not None:
            st.write("Using cached local data.")
            st.session_state.data = cached
            if "first" not in st.session_state:
                st.session_state.first = True
            return

    if st.session_state.hopsworks_project is None:
        st.write("Initializing Hopsworks connection...")
        st.session_state.hopsworks_project = connect_to_hopsworks()
        st.success("Connected to Hopsworks!")

    data = fetch_data_from_feature_group(st.session_state.hopsworks_project, "predictions", 1)
    filepath = save_data_locally(data, "data.csv")
    st.session_state.data = data
    st.success(f"Data fetched and saved locally at {filepath}")


# Streamlit UI
def main():
    """Render the bus occupancy explorer page.

    Loads the data, lets the user pick a bus, direction, stop, day and time
    window in the sidebar, and then shows occupancy charts and a map for
    every matching trip.
    """
    st.title("Wheely Fun Times - Bus Occupancy Explorer")

    # Initialize session state
    if "hopsworks_project" not in st.session_state:
        st.session_state.hopsworks_project = None
    if "data" not in st.session_state:
        st.session_state.data = None

    _ensure_data_loaded()
    data = st.session_state.data

    _, bus_list, short_bus = get_buses()

    # Sidebar section for searching buses
    st.sidebar.title("Search for your desired bus")

    search = st.sidebar.selectbox(
        "Search for your bus number:",
        options=short_bus,
        help="Select one bus to view details."
    )

    if search:
        route = bus_list[bus_list["route_short_name"] == search]
        long_names = list(pd.unique(route["route_long_name"]))
        if len(long_names) == 1:
            bus = long_names[0]
        else:
            # Several routes share this number: let the user pick one.
            bus = st.sidebar.selectbox(
                "Pick bus route:",
                options=long_names,
                help="Select one bus to view details."
            )
        st.write("### Selected Bus")
        st.write(f"{search}: {bus}")

        if "direction" not in st.session_state:
            st.session_state.direction = False

        # Button that flips the stored direction flag
        if st.sidebar.button('Change Direction'):
            st.session_state.direction = not st.session_state.direction

        # Copy the filtered rows so the datetime conversion below neither
        # touches the session data nor triggers SettingWithCopyWarning.
        bus_trips = data[data["route_long_name"] == bus].copy()
        # utc=True makes the conversion work for tz-naive and tz-aware
        # input alike; the timezone is then dropped for naive comparisons.
        # NOTE(review): the filter below compares these UTC-based values with
        # local datetime.now() dates - confirm this is intended.
        bus_trips["datetime"] = pd.to_datetime(bus_trips["datetime"], utc=True).dt.tz_convert(None)

        stops = list(pd.unique(bus_trips["stop_name"]))
        stop_choice = st.sidebar.selectbox(
            "Select your bus stop:",
            options=stops,
            help="Select one bus stop to se occupancy."
        )

        today = datetime.now().date()
        tomorrow = today + timedelta(days=1)
        date_options = {
            today.strftime("%d %B %Y"): today,
            tomorrow.strftime("%d %B %Y"): tomorrow,
        }

        day_choice = st.sidebar.radio("Select the day:", options=list(date_options.keys()))

        # Add time input widgets in the sidebar
        start_time = st.sidebar.time_input("Select a start time", value=None)
        end_time = st.sidebar.time_input("Select an end time", value=None)

        # TODO: stops - look at route_id to pick out all stops

        if start_time is not None and end_time is not None:
            st.write(f"Displaying buses between {start_time.strftime('%H:%M')} and {end_time.strftime('%H:%M')} the {day_choice}")

            chosen_day = date_options[day_choice]
            window_start = datetime.combine(chosen_day, start_time)
            window_end = datetime.combine(chosen_day, end_time)

            selected_trips = bus_trips[
                bus_trips["datetime"].between(window_start, window_end)
                & (bus_trips["direction_id"] == st.session_state.direction)
                & (bus_trips["stop_name"] == stop_choice)
            ]
            trip_ids = list(pd.unique(selected_trips["trip_id"]))

            # One arrival per distinct time at the chosen stop.
            arrivals = (
                selected_trips[["trip_id", "stop_name", "datetime"]]
                .sort_values(by=["datetime"])
                .drop_duplicates("datetime")
            )

            for _, row in arrivals.iterrows():
                arrival_time = row['datetime'].strftime('%H:%M')
                st.write(f"The bus arrives at {row['stop_name']} at {arrival_time}")
                plot_graph_title(data[data["trip_id"] == row["trip_id"]], row["stop_name"], arrival_time)

            st.write(f"Length {len(trip_ids)}")
            for trip_id in trip_ids:
                trip_rows = data[data["trip_id"] == trip_id]
                plot_graph(trip_rows)
                visualize(trip_rows, stop_choice)

    else:
        st.write("No buses selected. Please search in the sidebar.")

    # Placeholder output shown whenever data is available
    if st.session_state.data is not None:
        st.write("Hi")

# Run the app only when executed as a script (as `streamlit run` does), so
# importing this module has no side effects.
if __name__ == "__main__":
    main()

# Show all bus lines? Search?
    # How do we see the direction?
# Filter on bus line and direction
# Filter on time
    # Should the user enter a time?
# See all unique trip ids
# Map position to stop
# Show some kind of graph for all buses within that time
    # Should it cover all stops, or only the ones the user has said they will travel between?