Integrating Feature Engineering and Machine Learning for Enhanced Recommendations

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import random
import matplotlib.pyplot as plt
In [2]:
# Load the uploaded data files to examine their structure and content.
lines_df = pd.read_csv('lines.csv')
travel_times_df = pd.read_csv('travel_times.csv')
stations_df = pd.read_csv('stations.csv')

accuracy_threshold = 3.0  # Adjust threshold as needed
In [3]:
lines_df
Out[3]:
line sequence station
0 blue 1 Dublin
1 blue 2 West Dublin
2 blue 3 Castro Valley
3 blue 4 Bay Fair
4 blue 5 San Leandro
... ... ... ...
109 yellow 23 Daly City
110 yellow 24 Colma
111 yellow 25 South San Francisco
112 yellow 26 San Bruno
113 yellow 27 SFO

114 rows × 3 columns

In [4]:
travel_times_df
Out[4]:
station_1 station_2 travel_time
0 12th Street 19th Street 120
1 12th Street Lake Merritt 180
2 12th Street West Oakland 300
3 16th Street Mission 24th Street Mission 120
4 16th Street Mission Civic Center 180
5 19th Street MacArthur 180
6 24th Street Mission Glen Park 180
7 Antioch Pittsburg Center 420
8 Ashby Downtown Berkeley 180
9 Ashby MacArthur 240
10 Balboa Park Daly City 240
11 Balboa Park Glen Park 120
12 Bay Fair Castro Valley 240
13 Bay Fair Hayward 240
14 Bay Fair San Leandro 240
15 Berryessa Milpitas 300
16 Castro Valley West Dublin 600
17 Civic Center Powell Street 60
18 Coliseum Fruitvale 240
19 Coliseum OAK 480
20 Coliseum San Leandro 240
21 Colma Daly City 240
22 Colma South San Francisco 180
23 Concord North Concord 180
24 Concord Pleasant Hill 360
25 Downtown Berkeley North Berkeley 120
26 Dublin West Dublin 180
27 El Cerrito Plaza El Cerrito del Norte 180
28 El Cerrito Plaza North Berkeley 180
29 El Cerrito del Norte Richmond 300
30 Embarcadero Montgomery Street 60
31 Embarcadero West Oakland 420
32 Fremont Union City 300
33 Fremont Warm Springs 360
34 Fruitvale Lake Merritt 300
35 Hayward South Hayward 240
36 Lafayette Orinda 300
37 Lafayette Walnut Creek 300
38 Lake Merritt West Oakland 360
39 MacArthur Rockridge 240
40 Millbrae SFO 300
41 Millbrae San Bruno 420
42 Milpitas Warm Springs 540
43 Montgomery Street Powell Street 120
44 North Concord Pittsburg 360
45 Orinda Rockridge 300
46 Pittsburg Pittsburg Center 600
47 Pleasant Hill Walnut Creek 120
48 SFO San Bruno 240
49 San Bruno South San Francisco 240
50 South Hayward Union City 300

Simple Machine Learning Code to Predict Travel Times

In [6]:
# Merge data for feature engineering
# Merge travel times with stations to get coordinates for both station_1 and station_2
merged_df = travel_times_df.merge(
    stations_df, left_on='station_1', right_on='station', how='left'
).rename(columns={'latitude': 'latitude_1', 'longitude': 'longitude_1', 'transfer_time': 'transfer_time_1'}).drop(columns='station')

merged_df = merged_df.merge(
    stations_df, left_on='station_2', right_on='station', how='left'
).rename(columns={'latitude': 'latitude_2', 'longitude': 'longitude_2', 'transfer_time': 'transfer_time_2'}).drop(columns='station')
In [7]:
# Feature Engineering - Calculate Euclidean distance and add transfer times as features
merged_df['distance'] = np.sqrt(
    (merged_df['latitude_1'] - merged_df['latitude_2'])**2 + (merged_df['longitude_1'] - merged_df['longitude_2'])**2
)
merged_df['total_transfer_time'] = merged_df['transfer_time_1'] + merged_df['transfer_time_2']
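Note that this "distance" is a Euclidean norm taken directly on latitude/longitude degrees, which understates east-west separation relative to north-south. A great-circle (haversine) distance in kilometres would be more physically meaningful; a minimal sketch, reusing the merged_df columns above:

In [ ]:
# Sketch: haversine great-circle distance in kilometres as an alternative feature
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0  # mean Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    a = np.sin((lat2 - lat1) / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

merged_df['distance_km'] = haversine_km(
    merged_df['latitude_1'], merged_df['longitude_1'],
    merged_df['latitude_2'], merged_df['longitude_2']
)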
In [8]:
# Define features and target variable
X = merged_df[['distance', 'total_transfer_time']]
y = merged_df['travel_time']
In [9]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [10]:
# Train a simple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
Out[10]:
LinearRegression()
In [11]:
# Predict on the test set
y_pred = model.predict(X_test)
y_pred
Out[11]:
array([163.13177687, 178.4070064 , 289.71760209, 283.03456263,
       285.32395436, 476.19269994, 159.65832726, 310.26528824,
       177.63457537, 156.98810855, 284.58260386])
In [12]:
# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, r2
Out[12]:
(np.float64(87.09448907918099), 0.4594084785172443)
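With only 51 rows, the 80/20 split leaves about 11 test samples, so these metrics are noisy. Cross-validation over the whole dataset would give a more stable estimate; a minimal sketch:

In [ ]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated MAE on the full dataset
# (sklearn returns negated MAE for this scorer, hence the leading minus sign)
cv_mae = -cross_val_score(LinearRegression(), X, y,
                          scoring='neg_mean_absolute_error', cv=5)
print(f"CV MAE: {cv_mae.mean():.1f} +/- {cv_mae.std():.1f} seconds")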

Recommendations for Feature Engineering and Model Improvement

In [13]:
# Train a Random Forest model as a more complex alternative to Linear Regression
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
Out[13]:
RandomForestRegressor(random_state=42)
In [14]:
# Predict on the test set
y_pred_rf = random_forest_model.predict(X_test)
y_pred_rf
Out[14]:
array([166.2, 158.4, 312. , 345.6, 328.2, 480.6, 156.6, 299.4, 167.4,
       150.6, 292.2])
In [15]:
# Evaluate the Random Forest model's performance
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mae_rf, r2_rf
Out[15]:
(np.float64(87.38181818181819), 0.4768515513126492)
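To see which of the two features drives the Random Forest's predictions, its impurity-based importances can be inspected; a quick sketch:

In [ ]:
# Impurity-based feature importances of the fitted Random Forest
for name, importance in zip(X.columns, random_forest_model.feature_importances_):
    print(f"{name}: {importance:.3f}")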

Adding Additional Features

Peak/off-peak indicators and other temporal data

In [16]:
# Merging and feature engineering as before
merged_df = travel_times_df.merge(
    stations_df, left_on='station_1', right_on='station', how='left'
).rename(columns={'latitude': 'latitude_1', 'longitude': 'longitude_1', 'transfer_time': 'transfer_time_1'}).drop(columns='station')

merged_df = merged_df.merge(
    stations_df, left_on='station_2', right_on='station', how='left'
).rename(columns={'latitude': 'latitude_2', 'longitude': 'longitude_2', 'transfer_time': 'transfer_time_2'}).drop(columns='station')
In [17]:
# Calculate distance and total transfer time as features
merged_df['distance'] = np.sqrt(
    (merged_df['latitude_1'] - merged_df['latitude_2'])**2 + (merged_df['longitude_1'] - merged_df['longitude_2'])**2
)
merged_df['total_transfer_time'] = merged_df['transfer_time_1'] + merged_df['transfer_time_2']

# Simulate peak/off-peak times
def simulate_peak_hours():
    hour = random.choice(range(24))
    return 1 if (7 <= hour <= 10) or (16 <= hour <= 19) else 0

merged_df['peak_hour'] = merged_df.apply(lambda x: simulate_peak_hours(), axis=1)
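The peak flag above is drawn at random for each row because the data files contain no timestamps. With real departure times (a hypothetical departure_time column, not present in these CSVs), the flag could be derived deterministically; a sketch on dummy timestamps:

In [ ]:
# Hypothetical: derive the peak flag from actual departure timestamps
departures = pd.Series(pd.to_datetime(['2024-01-15 08:30',
                                       '2024-01-15 13:05',
                                       '2024-01-15 17:45']))
hour = departures.dt.hour
peak_flag = (hour.between(7, 10) | hour.between(16, 19)).astype(int)
peak_flag  # -> 1, 0, 1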
In [18]:
merged_df
Out[18]:
station_1 station_2 travel_time latitude_1 longitude_1 transfer_time_1 latitude_2 longitude_2 transfer_time_2 distance total_transfer_time peak_hour
0 12th Street 19th Street 120 37.803608 -122.272006 282 37.807869 -122.268980 67 0.005226 349 1
1 12th Street Lake Merritt 180 37.803608 -122.272006 282 37.797773 -122.266588 309 0.007963 591 0
2 12th Street West Oakland 300 37.803608 -122.272006 282 37.804900 -122.295100 283 0.023130 565 1
3 16th Street Mission 24th Street Mission 120 37.764847 -122.420042 287 37.752000 -122.418700 277 0.012917 564 0
4 16th Street Mission Civic Center 180 37.764847 -122.420042 287 37.779861 -122.413498 325 0.016378 612 0
5 19th Street MacArthur 180 37.807869 -122.268980 67 37.828260 -122.267275 59 0.020462 126 0
6 24th Street Mission Glen Park 180 37.752000 -122.418700 277 37.733118 -122.433808 314 0.024182 591 0
7 Antioch Pittsburg Center 420 37.996281 -121.783404 0 38.018227 -121.890178 0 0.109006 0 1
8 Ashby Downtown Berkeley 180 37.853068 -122.269957 299 37.869799 -122.268197 323 0.016823 622 1
9 Ashby MacArthur 240 37.853068 -122.269957 299 37.828260 -122.267275 59 0.024953 358 0
10 Balboa Park Daly City 240 37.721667 -122.447500 48 37.706224 -122.468934 285 0.026418 333 1
11 Balboa Park Glen Park 120 37.721667 -122.447500 48 37.733118 -122.433808 314 0.017849 362 0
12 Bay Fair Castro Valley 240 37.697000 -122.126500 63 37.690748 -122.075679 0 0.051204 63 0
13 Bay Fair Hayward 240 37.697000 -122.126500 63 37.669700 -122.087000 284 0.048016 347 1
14 Bay Fair San Leandro 240 37.697000 -122.126500 63 37.721764 -122.160684 298 0.042211 361 0
15 Berryessa Milpitas 300 37.368361 -121.874655 288 37.410278 -121.891111 292 0.045031 580 0
16 Castro Valley West Dublin 600 37.690748 -122.075679 0 37.699726 -121.928273 0 0.147679 0 0
17 Civic Center Powell Street 60 37.779861 -122.413498 325 37.784000 -122.408000 286 0.006882 611 1
18 Coliseum Fruitvale 240 37.753611 -122.196944 54 37.774800 -122.224100 279 0.034444 333 1
19 Coliseum OAK 480 37.753611 -122.196944 54 37.713200 -122.212200 0 0.043195 54 0
20 Coliseum San Leandro 240 37.753611 -122.196944 54 37.721764 -122.160684 298 0.048260 352 1
21 Colma Daly City 240 37.684722 -122.466111 276 37.706224 -122.468934 285 0.021687 561 0
22 Colma South San Francisco 180 37.684722 -122.466111 276 37.664264 -122.444043 316 0.030092 592 1
23 Concord North Concord 180 37.973745 -122.029127 0 38.003273 -122.024599 0 0.029873 0 0
24 Concord Pleasant Hill 360 37.973745 -122.029127 0 37.928399 -122.055992 0 0.052707 0 1
25 Downtown Berkeley North Berkeley 120 37.869799 -122.268197 323 37.873915 -122.282552 289 0.014933 612 0
26 Dublin West Dublin 180 37.701663 -121.899232 0 37.699726 -121.928273 0 0.029106 0 0
27 El Cerrito Plaza El Cerrito del Norte 180 37.902694 -122.298968 280 37.925183 -122.316939 311 0.028787 591 0
28 El Cerrito Plaza North Berkeley 180 37.902694 -122.298968 280 37.873915 -122.282552 289 0.033132 569 0
29 El Cerrito del Norte Richmond 300 37.925183 -122.316939 311 37.936811 -122.353095 302 0.037980 613 1
30 Embarcadero Montgomery Street 60 37.793056 -122.397222 304 37.789355 -122.401942 313 0.005998 617 0
31 Embarcadero West Oakland 420 37.793056 -122.397222 304 37.804900 -122.295100 283 0.102807 587 0
32 Fremont Union City 300 37.557489 -121.976620 306 37.590746 -122.017282 293 0.052530 599 0
33 Fremont Warm Springs 360 37.557489 -121.976620 306 37.503000 -121.940000 278 0.065651 584 0
34 Fruitvale Lake Merritt 300 37.774800 -122.224100 279 37.797773 -122.266588 309 0.048301 588 0
35 Hayward South Hayward 240 37.669700 -122.087000 284 37.634362 -122.057172 319 0.046244 603 1
36 Lafayette Orinda 300 37.893186 -122.124614 0 37.878427 -122.183740 0 0.060940 0 1
37 Lafayette Walnut Creek 300 37.893186 -122.124614 0 37.905724 -122.067332 0 0.058638 0 0
38 Lake Merritt West Oakland 360 37.797773 -122.266588 309 37.804900 -122.295100 283 0.029389 592 0
39 MacArthur Rockridge 240 37.828260 -122.267275 59 37.844452 -122.252083 0 0.022203 59 0
40 Millbrae SFO 300 37.600300 -122.386700 0 37.616400 -122.391000 291 0.016664 291 0
41 Millbrae San Bruno 420 37.600300 -122.386700 0 37.638300 -122.416500 307 0.048291 307 1
42 Milpitas Warm Springs 540 37.410278 -121.891111 292 37.503000 -121.940000 278 0.104821 570 1
43 Montgomery Street Powell Street 120 37.789355 -122.401942 313 37.784000 -122.408000 286 0.008086 599 0
44 North Concord Pittsburg 360 38.003273 -122.024599 0 38.018869 -121.944208 0 0.081890 0 0
45 Orinda Rockridge 300 37.878427 -122.183740 0 37.844452 -122.252083 0 0.076322 0 1
46 Pittsburg Pittsburg Center 600 38.018869 -121.944208 0 38.018227 -121.890178 0 0.054034 0 0
47 Pleasant Hill Walnut Creek 120 37.928399 -122.055992 0 37.905724 -122.067332 0 0.025353 0 0
48 SFO San Bruno 240 37.616400 -122.391000 291 37.638300 -122.416500 307 0.033613 598 0
49 San Bruno South San Francisco 240 37.638300 -122.416500 307 37.664264 -122.444043 316 0.037852 623 0
50 South Hayward Union City 300 37.634362 -122.057172 319 37.590746 -122.017282 293 0.059106 612 0
In [19]:
# Define features and target variable with the additional peak hour indicator
X = merged_df[['distance', 'total_transfer_time', 'peak_hour']]
y = merged_df['travel_time']

# Split the data again for the new features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [20]:
# Train the Random Forest model with additional features
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
Out[20]:
RandomForestRegressor(random_state=42)
In [21]:
# Predict and evaluate the model
y_pred_rf = random_forest_model.predict(X_test)
mae_rf_peak = mean_absolute_error(y_test, y_pred_rf)
r2_rf_peak = r2_score(y_test, y_pred_rf)

mae_rf_peak, r2_rf_peak
Out[21]:
(np.float64(88.14545454545454), 0.4764787589498808)
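The scores barely move because peak_hour is random noise with respect to the target. Permutation importance on the test set makes this visible; a sketch:

In [ ]:
from sklearn.inspection import permutation_importance

# Shuffle each feature in turn and measure the resulting drop in R^2;
# a randomly simulated peak_hour should show roughly zero importance
result = permutation_importance(random_forest_model, X_test, y_test,
                                n_repeats=30, random_state=42)
for name, mean_importance in zip(X_test.columns, result.importances_mean):
    print(f"{name}: {mean_importance:.3f}")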
In [22]:
# Define an accuracy threshold to classify regression predictions as accurate (1) or inaccurate (0)
# Here, any prediction within the threshold is considered accurate
accuracy_threshold = 3.0  # Adjust threshold as needed

# Hypothetical continuous predictions for each model (illustrative stand-ins,
# not the outputs of the models trained above)
y_pred_continuous_lr = [2.5, 3.1, 4.2, 2.9, 3.8, 1.1, 3.5, 2.8, 4.0, 3.3]  # Linear Regression
y_pred_continuous_rf = [1.9, 3.0, 4.1, 2.6, 3.4, 0.9, 3.1, 2.4, 4.3, 3.2]  # Random Forest
y_pred_continuous_rf_enhanced = [2.0, 3.2, 3.8, 2.5, 3.5, 1.0, 3.0, 2.7, 4.1, 3.3]  # Enhanced Random Forest

# Hypothetical binary ground truth, then threshold-based conversion of the
# continuous predictions (1 = accurate, 0 = inaccurate)
y_test_binary = [1, 1, 0, 0, 1, 1, 0, 1, 0, 1]
y_pred_binary_lr = [int(abs(pred - true) <= accuracy_threshold) for pred, true in zip(y_pred_continuous_lr, y_test_binary)]
y_pred_binary_rf = [int(abs(pred - true) <= accuracy_threshold) for pred, true in zip(y_pred_continuous_rf, y_test_binary)]
y_pred_binary_rf_enhanced = [int(abs(pred - true) <= accuracy_threshold) for pred, true in zip(y_pred_continuous_rf_enhanced, y_test_binary)]

# Calculating metrics
metrics = {
    'Algorithm': ['Linear Regression', 'Random Forest', 'Random Forest (Enhanced)'],
    'Precision': [
        precision_score(y_test_binary, y_pred_binary_lr),
        precision_score(y_test_binary, y_pred_binary_rf),
        precision_score(y_test_binary, y_pred_binary_rf_enhanced)
    ],
    'Recall': [
        recall_score(y_test_binary, y_pred_binary_lr),
        recall_score(y_test_binary, y_pred_binary_rf),
        recall_score(y_test_binary, y_pred_binary_rf_enhanced)
    ],
    'F1 Score': [
        f1_score(y_test_binary, y_pred_binary_lr),
        f1_score(y_test_binary, y_pred_binary_rf),
        f1_score(y_test_binary, y_pred_binary_rf_enhanced)
    ],
    'R2 Score': [
        r2_score(y_test_binary, y_pred_continuous_lr),
        r2_score(y_test_binary, y_pred_continuous_rf),
        r2_score(y_test_binary, y_pred_continuous_rf_enhanced)
    ],
    'MAE': [
        mean_absolute_error(y_test_binary, y_pred_continuous_lr),
        mean_absolute_error(y_test_binary, y_pred_continuous_rf),
        mean_absolute_error(y_test_binary, y_pred_continuous_rf_enhanced)
    ]
}

# Convert to DataFrame for display and further plotting
metrics_df = pd.DataFrame(metrics)
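The same thresholding can be applied to the real regression outputs from the cells above; a sketch, with an assumed tolerance of 60 seconds:

In [ ]:
# Flag a real Random Forest prediction as accurate if it lands
# within 60 seconds of the true travel time (assumed tolerance)
tolerance_seconds = 60
accurate_rf = (np.abs(y_pred_rf - y_test.to_numpy()) <= tolerance_seconds).astype(int)
print(f"Share of RF predictions within {tolerance_seconds}s: {accurate_rf.mean():.2f}")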
In [23]:
metrics_df
Out[23]:
Algorithm Precision Recall F1 Score R2 Score MAE
0 Linear Regression 0.857143 1.0 0.923077 -31.225000 2.52
1 Random Forest 0.857143 1.0 0.923077 -27.770833 2.31
2 Random Forest (Enhanced) 0.750000 1.0 0.857143 -26.820833 2.31
In [24]:
# Plotting the updated classification metrics for each algorithm
# Precision
plt.figure(figsize=(10, 6))
plt.bar(metrics_df['Algorithm'], metrics_df['Precision'])
plt.xlabel('Algorithm')
plt.ylabel('Precision')
plt.title('Algorithm Performance Comparison - Precision')
plt.show()

# Recall
plt.figure(figsize=(10, 6))
plt.bar(metrics_df['Algorithm'], metrics_df['Recall'])
plt.xlabel('Algorithm')
plt.ylabel('Recall')
plt.title('Algorithm Performance Comparison - Recall')
plt.show()

# F1 Score
plt.figure(figsize=(10, 6))
plt.bar(metrics_df['Algorithm'], metrics_df['F1 Score'])
plt.xlabel('Algorithm')
plt.ylabel('F1 Score')
plt.title('Algorithm Performance Comparison - F1 Score')
plt.show()

# R2 Score
plt.figure(figsize=(10, 6))
plt.bar(metrics_df['Algorithm'], metrics_df['R2 Score'])
plt.xlabel('Algorithm')
plt.ylabel('R2 Score')
plt.title('Algorithm Performance Comparison - R2 Score')
plt.show()

# MAE
plt.figure(figsize=(10, 6))
plt.bar(metrics_df['Algorithm'], metrics_df['MAE'])
plt.xlabel('Algorithm')
plt.ylabel('MAE')
plt.title('Algorithm Performance Comparison - MAE')
plt.show()
[Figure: five bar charts comparing Precision, Recall, F1 Score, R2 Score, and MAE across the three algorithms.]
In [27]:
# Repeat data enrichment for feature engineering
# Step 1: Enrich travel times with station coordinates and transfer times by merging with the stations data
enriched_travel_times = travel_times_df.merge(stations_df, left_on='station_1', right_on='station', suffixes=('', '_1'))
enriched_travel_times = enriched_travel_times.rename(columns={
    'latitude': 'latitude_1', 'longitude': 'longitude_1', 'transfer_time': 'transfer_time_1'
}).drop(columns=['station'])

enriched_travel_times = enriched_travel_times.merge(stations_df, left_on='station_2', right_on='station', suffixes=('', '_2'))
enriched_travel_times = enriched_travel_times.rename(columns={
    'latitude': 'latitude_2', 'longitude': 'longitude_2', 'transfer_time': 'transfer_time_2'
}).drop(columns=['station'])

# Note: stations served by several lines appear once per line in lines_df,
# so these merges can duplicate travel-time rows at transfer stations
enriched_travel_times = enriched_travel_times.merge(lines_df, left_on='station_1', right_on='station', how='left')
enriched_travel_times = enriched_travel_times.rename(columns={'line': 'line_1', 'sequence': 'sequence_1'}).drop(columns=['station'])

enriched_travel_times = enriched_travel_times.merge(lines_df, left_on='station_2', right_on='station', how='left')
enriched_travel_times = enriched_travel_times.rename(columns={'line': 'line_2', 'sequence': 'sequence_2'}).drop(columns=['station'])

# Create final aggregated dataset with additional features
aggregated_data = enriched_travel_times[[
    'station_1', 'station_2', 'travel_time',
    'latitude_1', 'longitude_1', 'transfer_time_1',
    'latitude_2', 'longitude_2', 'transfer_time_2',
    'sequence_1', 'sequence_2'
]].copy()

aggregated_data['distance'] = ((aggregated_data['latitude_2'] - aggregated_data['latitude_1'])**2 + 
                               (aggregated_data['longitude_2'] - aggregated_data['longitude_1'])**2)**0.5
aggregated_data['lat_diff'] = aggregated_data['latitude_2'] - aggregated_data['latitude_1']
aggregated_data['lon_diff'] = aggregated_data['longitude_2'] - aggregated_data['longitude_1']
# Note: this "peak" proxy is derived from the target itself (travel_time above its median),
# so it is useful for visualization but would leak the target if used as a model feature
aggregated_data['peak_indicator'] = (aggregated_data['travel_time'] > aggregated_data['travel_time'].median()).astype(int)

# Adding anomaly detection using Isolation Forest
from sklearn.ensemble import IsolationForest
anomaly_detector = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
anomaly_detector.fit(aggregated_data[['distance', 'transfer_time_1', 'transfer_time_2', 'lat_diff', 'lon_diff', 'peak_indicator']])
aggregated_data['anomaly'] = anomaly_detector.predict(aggregated_data[['distance', 'transfer_time_1', 'transfer_time_2', 'lat_diff', 'lon_diff', 'peak_indicator']])

# Now that we have re-created the data, proceed with the visualizations
plt.figure(figsize=(10, 6))
plt.scatter(aggregated_data['distance'], aggregated_data['travel_time'], 
            c=aggregated_data['peak_indicator'], cmap='coolwarm', alpha=0.7)
plt.colorbar(label='Peak Indicator')
plt.xlabel('Distance (Euclidean between stations)')
plt.ylabel('Travel Time')
plt.title('Travel Time vs Distance with Peak Indicator')
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 6))
plt.scatter(aggregated_data['distance'], aggregated_data['travel_time'], 
            c=aggregated_data['anomaly'], cmap='bwr', alpha=0.7)
plt.colorbar(label='Anomaly Detection (-1: Anomaly, 1: Normal)')
plt.xlabel('Distance (Euclidean between stations)')
plt.ylabel('Travel Time')
plt.title('Anomaly Detection in Travel Times')
plt.grid(True)
plt.show()
[Figure: two scatter plots of travel time vs. distance, one colored by the peak indicator and one by the Isolation Forest anomaly label.]
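The flagged rows can be pulled out directly to see which station pairs the Isolation Forest considers unusual; a quick sketch:

In [ ]:
# Station pairs flagged as anomalous (-1) by the Isolation Forest
flagged = aggregated_data.loc[aggregated_data['anomaly'] == -1,
                              ['station_1', 'station_2', 'travel_time', 'distance']]
flagged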
In [2]:
import folium
import time
from IPython.display import display

# Initial coordinates (e.g., starting location of the robot)
start_location = [37.7749, -122.4194]  # Replace with desired starting latitude and longitude

# List of coordinates to simulate movement along a path
path = [
    [37.7749, -122.4194],
    [37.7755, -122.4185],
    [37.7760, -122.4170],
    [37.7770, -122.4150],
    [37.7780, -122.4130]  # Add more points for longer routes
]

# Create the initial map centered on the starting location
m = folium.Map(location=start_location, zoom_start=15)

# Function to add a marker that represents the current location of the delivery robot
def update_marker(location):
    folium.Marker(location=location, popup="Delivery Robot", icon=folium.Icon(color="red")).add_to(m)
    display(m)

# Simulate the movement along the path by updating the marker location every few seconds
for location in path:
    update_marker(location)
    time.sleep(2)  # Wait 2 seconds between updates to simulate movement
[Output: five folium maps, one per path point; rendering requires trusting the notebook (File -> Trust Notebook).]
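Note that update_marker adds a new marker to the same map object, so each displayed frame accumulates every previous position. To show a single moving marker instead, the map can be rebuilt on each step; a sketch:

In [ ]:
# Rebuild the map on each step so only the current position is marked
for location in path:
    m = folium.Map(location=location, zoom_start=15)
    folium.PolyLine(path, color="blue", weight=2).add_to(m)  # full route for context
    folium.Marker(location=location, popup="Delivery Robot",
                  icon=folium.Icon(color="red")).add_to(m)
    display(m)
    time.sleep(2)  # pause between frames to simulate movement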