Integrating Feature Engineering and Machine Learning for Enhanced Recommendations
InĀ [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
InĀ [2]:
# Read the three input tables (line layouts, pairwise travel times, station details)
# from CSV files in the working directory, in that order.
lines_df, travel_times_df, stations_df = (
    pd.read_csv(csv_name)
    for csv_name in ('lines.csv', 'travel_times.csv', 'stations.csv')
)
# Tolerance used further down when regression errors are turned into accurate/inaccurate labels.
accuracy_threshold = 3.0  # Adjust threshold as needed
InĀ [3]:
line | sequence | station | |
0 | blue | 1 | Dublin |
1 | blue | 2 | West Dublin |
2 | blue | 3 | Castro Valley |
3 | blue | 4 | Bay Fair |
4 | blue | 5 | San Leandro |
... | ... | ... | ... |
109 | yellow | 23 | Daly City |
110 | yellow | 24 | Colma |
111 | yellow | 25 | South San Francisco |
112 | yellow | 26 | San Bruno |
113 | yellow | 27 | SFO |
114 rows × 3 columns
InĀ [4]:
station_1 | station_2 | travel_time | |
0 | 12th Street | 19th Street | 120 |
1 | 12th Street | Lake Merritt | 180 |
2 | 12th Street | West Oakland | 300 |
3 | 16th Street Mission | 24th Street Mission | 120 |
4 | 16th Street Mission | Civic Center | 180 |
5 | 19th Street | MacArthur | 180 |
6 | 24th Street Mission | Glen Park | 180 |
7 | Antioch | Pittsburg Center | 420 |
8 | Ashby | Downtown Berkeley | 180 |
9 | Ashby | MacArthur | 240 |
10 | Balboa Park | Daly City | 240 |
11 | Balboa Park | Glen Park | 120 |
12 | Bay Fair | Castro Valley | 240 |
13 | Bay Fair | Hayward | 240 |
14 | Bay Fair | San Leandro | 240 |
15 | Berryessa | Milpitas | 300 |
16 | Castro Valley | West Dublin | 600 |
17 | Civic Center | Powell Street | 60 |
18 | Coliseum | Fruitvale | 240 |
19 | Coliseum | OAK | 480 |
20 | Coliseum | San Leandro | 240 |
21 | Colma | Daly City | 240 |
22 | Colma | South San Francisco | 180 |
23 | Concord | North Concord | 180 |
24 | Concord | Pleasant Hill | 360 |
25 | Downtown Berkeley | North Berkeley | 120 |
26 | Dublin | West Dublin | 180 |
27 | El Cerrito Plaza | El Cerrito del Norte | 180 |
28 | El Cerrito Plaza | North Berkeley | 180 |
29 | El Cerrito del Norte | Richmond | 300 |
30 | Embarcadero | Montgomery Street | 60 |
31 | Embarcadero | West Oakland | 420 |
32 | Fremont | Union City | 300 |
33 | Fremont | Warm Springs | 360 |
34 | Fruitvale | Lake Merritt | 300 |
35 | Hayward | South Hayward | 240 |
36 | Lafayette | Orinda | 300 |
37 | Lafayette | Walnut Creek | 300 |
38 | Lake Merritt | West Oakland | 360 |
39 | MacArthur | Rockridge | 240 |
40 | Millbrae | SFO | 300 |
41 | Millbrae | San Bruno | 420 |
42 | Milpitas | Warm Springs | 540 |
43 | Montgomery Street | Powell Street | 120 |
44 | North Concord | Pittsburg | 360 |
45 | Orinda | Rockridge | 300 |
46 | Pittsburg | Pittsburg Center | 600 |
47 | Pleasant Hill | Walnut Creek | 120 |
48 | SFO | San Bruno | 240 |
49 | San Bruno | South San Francisco | 240 |
50 | South Hayward | Union City | 300 |
InĀ [5]:
station_1 | station_2 | travel_time | |
0 | 12th Street | 19th Street | 120 |
1 | 12th Street | Lake Merritt | 180 |
2 | 12th Street | West Oakland | 300 |
3 | 16th Street Mission | 24th Street Mission | 120 |
4 | 16th Street Mission | Civic Center | 180 |
5 | 19th Street | MacArthur | 180 |
6 | 24th Street Mission | Glen Park | 180 |
7 | Antioch | Pittsburg Center | 420 |
8 | Ashby | Downtown Berkeley | 180 |
9 | Ashby | MacArthur | 240 |
10 | Balboa Park | Daly City | 240 |
11 | Balboa Park | Glen Park | 120 |
12 | Bay Fair | Castro Valley | 240 |
13 | Bay Fair | Hayward | 240 |
14 | Bay Fair | San Leandro | 240 |
15 | Berryessa | Milpitas | 300 |
16 | Castro Valley | West Dublin | 600 |
17 | Civic Center | Powell Street | 60 |
18 | Coliseum | Fruitvale | 240 |
19 | Coliseum | OAK | 480 |
20 | Coliseum | San Leandro | 240 |
21 | Colma | Daly City | 240 |
22 | Colma | South San Francisco | 180 |
23 | Concord | North Concord | 180 |
24 | Concord | Pleasant Hill | 360 |
25 | Downtown Berkeley | North Berkeley | 120 |
26 | Dublin | West Dublin | 180 |
27 | El Cerrito Plaza | El Cerrito del Norte | 180 |
28 | El Cerrito Plaza | North Berkeley | 180 |
29 | El Cerrito del Norte | Richmond | 300 |
30 | Embarcadero | Montgomery Street | 60 |
31 | Embarcadero | West Oakland | 420 |
32 | Fremont | Union City | 300 |
33 | Fremont | Warm Springs | 360 |
34 | Fruitvale | Lake Merritt | 300 |
35 | Hayward | South Hayward | 240 |
36 | Lafayette | Orinda | 300 |
37 | Lafayette | Walnut Creek | 300 |
38 | Lake Merritt | West Oakland | 360 |
39 | MacArthur | Rockridge | 240 |
40 | Millbrae | SFO | 300 |
41 | Millbrae | San Bruno | 420 |
42 | Milpitas | Warm Springs | 540 |
43 | Montgomery Street | Powell Street | 120 |
44 | North Concord | Pittsburg | 360 |
45 | Orinda | Rockridge | 300 |
46 | Pittsburg | Pittsburg Center | 600 |
47 | Pleasant Hill | Walnut Creek | 120 |
48 | SFO | San Bruno | 240 |
49 | San Bruno | South San Francisco | 240 |
50 | South Hayward | Union City | 300 |
Simple Machine Learning Code to Predict Travel Times
InĀ [6]:
# Attach station attributes to every travel-time row, once per endpoint.
# Each pass left-joins stations_df on the endpoint's name, tags the station columns
# with the endpoint number, and discards the redundant join key.
merged_df = travel_times_df
for endpoint in ('1', '2'):
    tagged_columns = {
        column: f'{column}_{endpoint}'
        for column in ('latitude', 'longitude', 'transfer_time')
    }
    merged_df = (
        merged_df
        .merge(stations_df, how='left', left_on=f'station_{endpoint}', right_on='station')
        .rename(columns=tagged_columns)
        .drop(columns='station')
    )
InĀ [7]:
# Feature engineering: Euclidean distance between the two stations plus their combined transfer time.
# NOTE: the distance is computed directly on latitude/longitude values, so it is expressed
# in degrees rather than a physical length unit.
merged_df['distance'] = np.sqrt(
    (merged_df['latitude_1'] - merged_df['latitude_2'])**2
    + (merged_df['longitude_1'] - merged_df['longitude_2'])**2
)
merged_df['total_transfer_time'] = merged_df['transfer_time_1'] + merged_df['transfer_time_2']
InĀ [8]:
# Model inputs are the two engineered columns; the target is the travel time of the station pair.
feature_columns = ['distance', 'total_transfer_time']
X, y = merged_df[feature_columns], merged_df['travel_time']
InĀ [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
InĀ [10]:
# Train a simple linear regression model on the training split.
# fit() returns the fitted estimator, so leaving it as the last expression also renders the model.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
InĀ [11]:
# Predict on the test set with the linear model.
y_pred = model.predict(X_test)
# Trailing expression so the cell actually displays the predictions (an assignment alone shows nothing).
y_pred
array([163.13177687, 178.4070064 , 289.71760209, 283.03456263, 285.32395436, 476.19269994, 159.65832726, 310.26528824, 177.63457537, 156.98810855, 284.58260386])
InĀ [12]:
# Score the linear model on the held-out rows: mean absolute error and R^2.
mae, r2 = (
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
(mae, r2)
(np.float64(87.09448907918099), 0.4594084785172443)
Recommendations for Feature Engineering and Model Improvement
InĀ [13]:
# Train a Random Forest model as a more complex alternative to Linear Regression.
# The fixed random_state makes the fitted forest repeatable for the same training data.
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
RandomForestRegressor(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
InĀ [14]:
# Predict on the test set with the random forest.
y_pred_rf = random_forest_model.predict(X_test)
# Trailing expression so the cell actually displays the predictions (an assignment alone shows nothing).
y_pred_rf
array([166.2, 158.4, 312. , 345.6, 328.2, 480.6, 156.6, 299.4, 167.4, 150.6, 292.2])
InĀ [15]:
# Score the random forest on the same held-out rows: mean absolute error and R^2.
mae_rf, r2_rf = (
    mean_absolute_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)
(mae_rf, r2_rf)
(np.float64(87.38181818181819), 0.4768515513126492)
InĀ [16]:
# Rebuild merged_df from the raw travel times: left-join the station attributes for each
# endpoint, tag them with the endpoint number, and drop the redundant join key.
station_1_columns = {'latitude': 'latitude_1', 'longitude': 'longitude_1', 'transfer_time': 'transfer_time_1'}
station_2_columns = {'latitude': 'latitude_2', 'longitude': 'longitude_2', 'transfer_time': 'transfer_time_2'}
merged_df = (
    travel_times_df
    .merge(stations_df, how='left', left_on='station_1', right_on='station')
    .rename(columns=station_1_columns)
    .drop(columns='station')
    .merge(stations_df, how='left', left_on='station_2', right_on='station')
    .rename(columns=station_2_columns)
    .drop(columns='station')
)
InĀ [17]:
# Calculate distance and total transfer time as features.
# NOTE: the distance is computed directly on latitude/longitude values, so it is expressed
# in degrees rather than a physical length unit.
merged_df['distance'] = np.sqrt(
    (merged_df['latitude_1'] - merged_df['latitude_2'])**2
    + (merged_df['longitude_1'] - merged_df['longitude_2'])**2
)
merged_df['total_transfer_time'] = merged_df['transfer_time_1'] + merged_df['transfer_time_2']
# Simulate peak/off-peak times
def simulate_peak_hours(hour=None):
    """Return 1 if an hour of the day falls in a peak window, else 0.

    The peak windows are 07-10 and 16-19, both ends inclusive.

    Parameters
    ----------
    hour : int, optional
        Hour of the day to classify. When omitted, an hour is drawn uniformly
        from 0-23 with the ``random`` module, so the result is random unless
        ``random.seed`` was called beforehand.

    Returns
    -------
    int
        1 for a peak hour, 0 otherwise.
    """
    if hour is None:
        hour = random.choice(range(24))
    return 1 if (7 <= hour <= 10) or (16 <= hour <= 19) else 0
# Seed the stdlib RNG so the simulated flags (and any model trained on them) are reproducible.
random.seed(42)
# One independent random draw per row. The flag is synthetic: it does not depend on the row's data.
merged_df['peak_hour'] = [simulate_peak_hours() for _ in range(len(merged_df))]
InĀ [18]:
station_1 | station_2 | travel_time | latitude_1 | longitude_1 | transfer_time_1 | latitude_2 | longitude_2 | transfer_time_2 | distance | total_transfer_time | peak_hour | |
0 | 12th Street | 19th Street | 120 | 37.803608 | -122.272006 | 282 | 37.807869 | -122.268980 | 67 | 0.005226 | 349 | 1 |
1 | 12th Street | Lake Merritt | 180 | 37.803608 | -122.272006 | 282 | 37.797773 | -122.266588 | 309 | 0.007963 | 591 | 0 |
2 | 12th Street | West Oakland | 300 | 37.803608 | -122.272006 | 282 | 37.804900 | -122.295100 | 283 | 0.023130 | 565 | 1 |
3 | 16th Street Mission | 24th Street Mission | 120 | 37.764847 | -122.420042 | 287 | 37.752000 | -122.418700 | 277 | 0.012917 | 564 | 0 |
4 | 16th Street Mission | Civic Center | 180 | 37.764847 | -122.420042 | 287 | 37.779861 | -122.413498 | 325 | 0.016378 | 612 | 0 |
5 | 19th Street | MacArthur | 180 | 37.807869 | -122.268980 | 67 | 37.828260 | -122.267275 | 59 | 0.020462 | 126 | 0 |
6 | 24th Street Mission | Glen Park | 180 | 37.752000 | -122.418700 | 277 | 37.733118 | -122.433808 | 314 | 0.024182 | 591 | 0 |
7 | Antioch | Pittsburg Center | 420 | 37.996281 | -121.783404 | 0 | 38.018227 | -121.890178 | 0 | 0.109006 | 0 | 1 |
8 | Ashby | Downtown Berkeley | 180 | 37.853068 | -122.269957 | 299 | 37.869799 | -122.268197 | 323 | 0.016823 | 622 | 1 |
9 | Ashby | MacArthur | 240 | 37.853068 | -122.269957 | 299 | 37.828260 | -122.267275 | 59 | 0.024953 | 358 | 0 |
10 | Balboa Park | Daly City | 240 | 37.721667 | -122.447500 | 48 | 37.706224 | -122.468934 | 285 | 0.026418 | 333 | 1 |
11 | Balboa Park | Glen Park | 120 | 37.721667 | -122.447500 | 48 | 37.733118 | -122.433808 | 314 | 0.017849 | 362 | 0 |
12 | Bay Fair | Castro Valley | 240 | 37.697000 | -122.126500 | 63 | 37.690748 | -122.075679 | 0 | 0.051204 | 63 | 0 |
13 | Bay Fair | Hayward | 240 | 37.697000 | -122.126500 | 63 | 37.669700 | -122.087000 | 284 | 0.048016 | 347 | 1 |
14 | Bay Fair | San Leandro | 240 | 37.697000 | -122.126500 | 63 | 37.721764 | -122.160684 | 298 | 0.042211 | 361 | 0 |
15 | Berryessa | Milpitas | 300 | 37.368361 | -121.874655 | 288 | 37.410278 | -121.891111 | 292 | 0.045031 | 580 | 0 |
16 | Castro Valley | West Dublin | 600 | 37.690748 | -122.075679 | 0 | 37.699726 | -121.928273 | 0 | 0.147679 | 0 | 0 |
17 | Civic Center | Powell Street | 60 | 37.779861 | -122.413498 | 325 | 37.784000 | -122.408000 | 286 | 0.006882 | 611 | 1 |
18 | Coliseum | Fruitvale | 240 | 37.753611 | -122.196944 | 54 | 37.774800 | -122.224100 | 279 | 0.034444 | 333 | 1 |
19 | Coliseum | OAK | 480 | 37.753611 | -122.196944 | 54 | 37.713200 | -122.212200 | 0 | 0.043195 | 54 | 0 |
20 | Coliseum | San Leandro | 240 | 37.753611 | -122.196944 | 54 | 37.721764 | -122.160684 | 298 | 0.048260 | 352 | 1 |
21 | Colma | Daly City | 240 | 37.684722 | -122.466111 | 276 | 37.706224 | -122.468934 | 285 | 0.021687 | 561 | 0 |
22 | Colma | South San Francisco | 180 | 37.684722 | -122.466111 | 276 | 37.664264 | -122.444043 | 316 | 0.030092 | 592 | 1 |
23 | Concord | North Concord | 180 | 37.973745 | -122.029127 | 0 | 38.003273 | -122.024599 | 0 | 0.029873 | 0 | 0 |
24 | Concord | Pleasant Hill | 360 | 37.973745 | -122.029127 | 0 | 37.928399 | -122.055992 | 0 | 0.052707 | 0 | 1 |
25 | Downtown Berkeley | North Berkeley | 120 | 37.869799 | -122.268197 | 323 | 37.873915 | -122.282552 | 289 | 0.014933 | 612 | 0 |
26 | Dublin | West Dublin | 180 | 37.701663 | -121.899232 | 0 | 37.699726 | -121.928273 | 0 | 0.029106 | 0 | 0 |
27 | El Cerrito Plaza | El Cerrito del Norte | 180 | 37.902694 | -122.298968 | 280 | 37.925183 | -122.316939 | 311 | 0.028787 | 591 | 0 |
28 | El Cerrito Plaza | North Berkeley | 180 | 37.902694 | -122.298968 | 280 | 37.873915 | -122.282552 | 289 | 0.033132 | 569 | 0 |
29 | El Cerrito del Norte | Richmond | 300 | 37.925183 | -122.316939 | 311 | 37.936811 | -122.353095 | 302 | 0.037980 | 613 | 1 |
30 | Embarcadero | Montgomery Street | 60 | 37.793056 | -122.397222 | 304 | 37.789355 | -122.401942 | 313 | 0.005998 | 617 | 0 |
31 | Embarcadero | West Oakland | 420 | 37.793056 | -122.397222 | 304 | 37.804900 | -122.295100 | 283 | 0.102807 | 587 | 0 |
32 | Fremont | Union City | 300 | 37.557489 | -121.976620 | 306 | 37.590746 | -122.017282 | 293 | 0.052530 | 599 | 0 |
33 | Fremont | Warm Springs | 360 | 37.557489 | -121.976620 | 306 | 37.503000 | -121.940000 | 278 | 0.065651 | 584 | 0 |
34 | Fruitvale | Lake Merritt | 300 | 37.774800 | -122.224100 | 279 | 37.797773 | -122.266588 | 309 | 0.048301 | 588 | 0 |
35 | Hayward | South Hayward | 240 | 37.669700 | -122.087000 | 284 | 37.634362 | -122.057172 | 319 | 0.046244 | 603 | 1 |
36 | Lafayette | Orinda | 300 | 37.893186 | -122.124614 | 0 | 37.878427 | -122.183740 | 0 | 0.060940 | 0 | 1 |
37 | Lafayette | Walnut Creek | 300 | 37.893186 | -122.124614 | 0 | 37.905724 | -122.067332 | 0 | 0.058638 | 0 | 0 |
38 | Lake Merritt | West Oakland | 360 | 37.797773 | -122.266588 | 309 | 37.804900 | -122.295100 | 283 | 0.029389 | 592 | 0 |
39 | MacArthur | Rockridge | 240 | 37.828260 | -122.267275 | 59 | 37.844452 | -122.252083 | 0 | 0.022203 | 59 | 0 |
40 | Millbrae | SFO | 300 | 37.600300 | -122.386700 | 0 | 37.616400 | -122.391000 | 291 | 0.016664 | 291 | 0 |
41 | Millbrae | San Bruno | 420 | 37.600300 | -122.386700 | 0 | 37.638300 | -122.416500 | 307 | 0.048291 | 307 | 1 |
42 | Milpitas | Warm Springs | 540 | 37.410278 | -121.891111 | 292 | 37.503000 | -121.940000 | 278 | 0.104821 | 570 | 1 |
43 | Montgomery Street | Powell Street | 120 | 37.789355 | -122.401942 | 313 | 37.784000 | -122.408000 | 286 | 0.008086 | 599 | 0 |
44 | North Concord | Pittsburg | 360 | 38.003273 | -122.024599 | 0 | 38.018869 | -121.944208 | 0 | 0.081890 | 0 | 0 |
45 | Orinda | Rockridge | 300 | 37.878427 | -122.183740 | 0 | 37.844452 | -122.252083 | 0 | 0.076322 | 0 | 1 |
46 | Pittsburg | Pittsburg Center | 600 | 38.018869 | -121.944208 | 0 | 38.018227 | -121.890178 | 0 | 0.054034 | 0 | 0 |
47 | Pleasant Hill | Walnut Creek | 120 | 37.928399 | -122.055992 | 0 | 37.905724 | -122.067332 | 0 | 0.025353 | 0 | 0 |
48 | SFO | San Bruno | 240 | 37.616400 | -122.391000 | 291 | 37.638300 | -122.416500 | 307 | 0.033613 | 598 | 0 |
49 | San Bruno | South San Francisco | 240 | 37.638300 | -122.416500 | 307 | 37.664264 | -122.444043 | 316 | 0.037852 | 623 | 0 |
50 | South Hayward | Union City | 300 | 37.634362 | -122.057172 | 319 | 37.590746 | -122.017282 | 293 | 0.059106 | 612 | 0 |
InĀ [19]:
# The feature matrix now carries the simulated peak-hour flag alongside the two original predictors.
feature_columns = ['distance', 'total_transfer_time', 'peak_hour']
X, y = merged_df[feature_columns], merged_df['travel_time']
# Same 80/20 split settings as before, redone because the feature set changed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
InĀ [20]:
# Train the Random Forest model with the additional peak-hour feature.
# This rebinds random_forest_model, replacing the forest fitted on the two-feature data.
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
RandomForestRegressor(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
InĀ [21]:
# Predict with the retrained forest and score it on the held-out rows (MAE and R^2).
y_pred_rf = random_forest_model.predict(X_test)
mae_rf_peak, r2_rf_peak = (
    mean_absolute_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)
(mae_rf_peak, r2_rf_peak)
(np.float64(88.14545454545454), 0.4764787589498808)
InĀ [22]:
# Define an accuracy threshold to classify regression predictions as accurate (1) or inaccurate (0).
# Any prediction whose absolute error is within the threshold is considered accurate.
accuracy_threshold = 3.0  # Adjust threshold as needed

# Hypothetical continuous predictions for each model.
# NOTE(review): these values and the labels below are hard-coded; they are not the outputs of
# the models trained earlier in the notebook, so the resulting table does not describe those
# models. Confirm whether this is intentional.
y_pred_continuous_lr = [2.5, 3.1, 4.2, 2.9, 3.8, 1.1, 3.5, 2.8, 4.0, 3.3]  # Preds for Linear Regression
y_pred_continuous_rf = [1.9, 3.0, 4.1, 2.6, 3.4, 0.9, 3.1, 2.4, 4.3, 3.2]  # Preds for Random Forest
y_pred_continuous_rf_enhanced = [2.0, 3.2, 3.8, 2.5, 3.5, 1.0, 3.0, 2.7, 4.1, 3.3]  # Preds for Enhanced RF

# Reference labels (1 = accurate, 0 = inaccurate).
# NOTE(review): the same 0/1 list is also used as the "true" continuous value when thresholding
# and when computing R2/MSE below -- verify that this is the intended ground truth.
y_test_binary = [1, 1, 0, 0, 1, 1, 0, 1, 0, 1]

# Converting continuous predictions to binary classification based on the threshold.
y_pred_binary_lr = [
    int(abs(pred - true) <= accuracy_threshold)
    for pred, true in zip(y_pred_continuous_lr, y_test_binary)
]
y_pred_binary_rf = [
    int(abs(pred - true) <= accuracy_threshold)
    for pred, true in zip(y_pred_continuous_rf, y_test_binary)
]
y_pred_binary_rf_enhanced = [
    int(abs(pred - true) <= accuracy_threshold)
    for pred, true in zip(y_pred_continuous_rf_enhanced, y_test_binary)
]

# Calculating metrics: one list per metric, ordered like the 'Algorithm' list.
metrics = {
    'Algorithm': ['Linear Regression', 'Random Forest', 'Random Forest (Enhanced)'],
    'Precision': [
        precision_score(y_test_binary, y_pred_binary_lr),
        precision_score(y_test_binary, y_pred_binary_rf),
        precision_score(y_test_binary, y_pred_binary_rf_enhanced),
    ],
    'Recall': [
        recall_score(y_test_binary, y_pred_binary_lr),
        recall_score(y_test_binary, y_pred_binary_rf),
        recall_score(y_test_binary, y_pred_binary_rf_enhanced),
    ],
    'F1 Score': [
        f1_score(y_test_binary, y_pred_binary_lr),
        f1_score(y_test_binary, y_pred_binary_rf),
        f1_score(y_test_binary, y_pred_binary_rf_enhanced),
    ],
    'R2 Score': [
        r2_score(y_test_binary, y_pred_continuous_lr),
        r2_score(y_test_binary, y_pred_continuous_rf),
        r2_score(y_test_binary, y_pred_continuous_rf_enhanced),
    ],
    # Mean squared error, as the column name states.
    'MSE': [
        mean_squared_error(y_test_binary, y_pred_continuous_lr),
        mean_squared_error(y_test_binary, y_pred_continuous_rf),
        mean_squared_error(y_test_binary, y_pred_continuous_rf_enhanced),
    ],
}

# Convert to DataFrame for display and further plotting
metrics_df = pd.DataFrame(metrics)
InĀ [23]:
Algorithm | Precision | Recall | F1 Score | R2 Score | MSE | |
0 | Linear Regression | 0.857143 | 1.0 | 0.923077 | -31.225000 | 2.52 |
1 | Random Forest | 0.857143 | 1.0 | 0.923077 | -27.770833 | 2.31 |
2 | Random Forest (Enhanced) | 0.750000 | 1.0 | 0.857143 | -26.820833 | 2.31 |
InĀ [24]:
# Plotting the updated classification metrics for each algorithm:
# one bar chart per metric column of metrics_df, with one bar per algorithm.
for metric_name in ['Precision', 'Recall', 'F1 Score', 'R2 Score', 'MSE']:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(metrics_df['Algorithm'], metrics_df[metric_name])
    ax.set_xlabel('Algorithm')
    ax.set_ylabel(metric_name)
    ax.set_title(f'Algorithm Performance Comparison - {metric_name}')
    plt.show()
InĀ [27]:
# Repeat data enrichment for feature engineering
# Step 1: Enrich travel times with station coordinates and transfer times by merging with the stations data.
# The 'station' join key is dropped after each merge; otherwise it would overlap on the next
# merge and the '_2' suffix would produce a second 'station_2' column.
enriched_travel_times = travel_times_df.merge(
    stations_df, left_on='station_1', right_on='station', suffixes=('', '_1')
)
enriched_travel_times = enriched_travel_times.rename(columns={
    'latitude': 'latitude_1', 'longitude': 'longitude_1', 'transfer_time': 'transfer_time_1'
}).drop(columns=['station'])
enriched_travel_times = enriched_travel_times.merge(
    stations_df, left_on='station_2', right_on='station', suffixes=('', '_2')
)
enriched_travel_times = enriched_travel_times.rename(columns={
    'latitude': 'latitude_2', 'longitude': 'longitude_2', 'transfer_time': 'transfer_time_2'
}).drop(columns=['station'])

# Step 2: Attach the line name and stop sequence of each endpoint.
# NOTE(review): lines_df has one row per (line, station), so a station served by several lines
# presumably yields several rows per station pair here -- confirm this duplication is intended.
enriched_travel_times = enriched_travel_times.merge(lines_df, left_on='station_1', right_on='station', how='left')
enriched_travel_times = enriched_travel_times.rename(columns={'line': 'line_1', 'sequence': 'sequence_1'}).drop(columns=['station'])
enriched_travel_times = enriched_travel_times.merge(lines_df, left_on='station_2', right_on='station', how='left')
enriched_travel_times = enriched_travel_times.rename(columns={'line': 'line_2', 'sequence': 'sequence_2'}).drop(columns=['station'])

# Create final aggregated dataset with additional features.
# .copy() makes this an independent frame so the column assignments below do not write into a slice.
aggregated_data = enriched_travel_times[[
    'station_1', 'station_2', 'travel_time',
    'latitude_1', 'longitude_1', 'transfer_time_1',
    'latitude_2', 'longitude_2', 'transfer_time_2',
    'sequence_1', 'sequence_2'
]].copy()
aggregated_data['lat_diff'] = aggregated_data['latitude_2'] - aggregated_data['latitude_1']
aggregated_data['lon_diff'] = aggregated_data['longitude_2'] - aggregated_data['longitude_1']
# Euclidean distance on raw latitude/longitude differences (i.e. in degrees).
aggregated_data['distance'] = (aggregated_data['lat_diff']**2 + aggregated_data['lon_diff']**2)**0.5
# Despite the name, this flag is derived from travel_time itself (1 = above the median travel
# time), not from the time of day.
aggregated_data['peak_indicator'] = (aggregated_data['travel_time'] > aggregated_data['travel_time'].median()).astype(int)

# Adding anomaly detection using Isolation Forest: fit on the engineered columns, then label
# every row (-1 = anomaly, 1 = normal).
anomaly_features = ['distance', 'transfer_time_1', 'transfer_time_2', 'lat_diff', 'lon_diff', 'peak_indicator']
anomaly_detector = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
anomaly_detector.fit(aggregated_data[anomaly_features])
aggregated_data['anomaly'] = anomaly_detector.predict(aggregated_data[anomaly_features])

# Now that we have re-created the data, proceed with the visualizations
plt.figure(figsize=(10, 6))
plt.scatter(aggregated_data['distance'], aggregated_data['travel_time'],
            c=aggregated_data['peak_indicator'], cmap='coolwarm', alpha=0.7)
plt.colorbar(label='Peak Indicator')
plt.xlabel('Distance (Euclidean between stations)')
plt.ylabel('Travel Time')
plt.title('Travel Time vs Distance with Peak Indicator')
plt.show()

plt.figure(figsize=(10, 6))
plt.scatter(aggregated_data['distance'], aggregated_data['travel_time'],
            c=aggregated_data['anomaly'], cmap='bwr', alpha=0.7)
plt.colorbar(label='Anomaly Detection (-1: Anomaly, 1: Normal)')
plt.xlabel('Distance (Euclidean between stations)')
plt.ylabel('Travel Time')
plt.title('Anomaly Detection in Travel Times')
plt.show()
InĀ [2]:
import folium
import time
from IPython.display import display
# Initial coordinates (e.g., starting location of the robot)
start_location = [37.7749, -122.4194]  # Replace with desired starting latitude and longitude

# List of [latitude, longitude] points to simulate movement along a path
path = [
    [37.7749, -122.4194],
    [37.7755, -122.4185],
    [37.7760, -122.4170],
    [37.7770, -122.4150],
    [37.7780, -122.4130],  # Add more points for longer routes
]

# Create the initial map centered on the starting location
m = folium.Map(location=start_location, zoom_start=15)


def update_marker(location):
    """Add a red "Delivery Robot" marker at ``location`` to the module-level map ``m``.

    Markers are only ever added, never removed, so positions from earlier calls
    stay visible on the map.

    Parameters
    ----------
    location : list of float
        ``[latitude, longitude]`` of the robot's current position.
    """
    folium.Marker(location=location, popup="Delivery Robot", icon=folium.Icon(color="red")).add_to(m)


# Simulate the movement along the path by updating the marker location every few seconds
for location in path:
    update_marker(location)
    display(m)  # render the map again with the marker for this step
    time.sleep(2)  # Wait 2 seconds between updates to simulate movement
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook