In [1]:
!pip install py2neo networkx matplotlib plotly kaleido
Requirement already satisfied: py2neo in /opt/conda/lib/python3.9/site-packages (2021.2.4)
Requirement already satisfied: networkx in /opt/conda/lib/python3.9/site-packages (2.6.3)
Requirement already satisfied: matplotlib in /opt/conda/lib/python3.9/site-packages (3.4.3)
Requirement already satisfied: plotly in /opt/conda/lib/python3.9/site-packages (5.24.1)
Collecting kaleido
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
|████████████████████████████████| 79.9 MB 79.6 MB/s eta 0:00:01
Requirement already satisfied: pansi>=2020.7.3 in /opt/conda/lib/python3.9/site-packages (from py2neo) (2024.11.0)
Requirement already satisfied: certifi in /opt/conda/lib/python3.9/site-packages (from py2neo) (2021.10.8)
Requirement already satisfied: monotonic in /opt/conda/lib/python3.9/site-packages (from py2neo) (1.6)
Requirement already satisfied: pygments>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from py2neo) (2.10.0)
Requirement already satisfied: interchange~=2021.0.4 in /opt/conda/lib/python3.9/site-packages (from py2neo) (2021.0.4)
Requirement already satisfied: urllib3 in /opt/conda/lib/python3.9/site-packages (from py2neo) (1.26.7)
Requirement already satisfied: six>=1.15.0 in /opt/conda/lib/python3.9/site-packages (from py2neo) (1.16.0)
Requirement already satisfied: packaging in /opt/conda/lib/python3.9/site-packages (from py2neo) (21.0)
Requirement already satisfied: numpy>=1.16 in /opt/conda/lib/python3.9/site-packages (from matplotlib) (1.20.3)
Requirement already satisfied: pyparsing>=2.2.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib) (3.0.4)
Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.9/site-packages (from matplotlib) (0.10.0)
Requirement already satisfied: pillow>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from matplotlib) (8.4.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from matplotlib) (1.3.1)
Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.9/site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: tenacity>=6.2.0 in /opt/conda/lib/python3.9/site-packages (from plotly) (9.0.0)
Requirement already satisfied: pytz in /opt/conda/lib/python3.9/site-packages (from interchange~=2021.0.4->py2neo) (2021.3)
Installing collected packages: kaleido
Successfully installed kaleido-0.2.1
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
In [2]:
import neo4j
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from py2neo import Graph
from IPython.display import display, IFrame
In [3]:
# Connect to Neo4j
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))
session = driver.session(database="neo4j")
In [4]:
# Load Dataframes
stations_df = pd.read_csv('data/stations.csv')
travel_times_df = pd.read_csv('data/travel_times.csv')
lines_df = pd.read_csv('data/lines.csv')
Build the Graph¶
In [5]:
# Define functions
def my_neo4j_wipe_out_database():
"wipe out database by deleting all nodes and relationships"
query = "match (node)-[relationship]->() delete node, relationship"
session.run(query)
query = "match (node) delete node"
session.run(query)
def generate_neo4j_query_from_data(stations_df, travel_times_df):
# Start the CREATE statement
query = "CREATE\n"
# Step 1: Add nodes for each station
station_lines = []
for _, row in stations_df.iterrows():
# Use the station name as a Neo4j node identifier and replace spaces with underscores for compatibility
station_identifier = row['station'].replace(' ', '_')
# Prefix identifier with an underscore if it starts with a digit
if station_identifier[0].isdigit():
station_identifier = f"_{station_identifier}"
station_line = (
f" ({station_identifier}:Station {{name: '{row['station']}', "
f"latitude: {row['latitude']}, longitude: {row['longitude']}, "
f"transfer_time: {row['transfer_time']}}})"
)
station_lines.append(station_line)
# Step 2: Add relationships based on travel times
relationship_lines = []
for _, row in travel_times_df.iterrows():
# Use the station names as identifiers, replacing spaces with underscores
station1 = row['station_1'].replace(' ', '_')
station2 = row['station_2'].replace(' ', '_')
# Prefix identifiers with an underscore if they start with a digit
if station1[0].isdigit():
station1 = f"_{station1}"
if station2[0].isdigit():
station2 = f"_{station2}"
relationship_line = (
f" ({station1})-[:TRACK {{travel_time: {row['travel_time']}}}]->({station2})"
)
relationship_lines.append(relationship_line)
# Assuming undirected track, we add reverse as well
reverse_relationship_line = (
f" ({station2})-[:TRACK {{travel_time: {row['travel_time']}}}]->({station1})"
)
relationship_lines.append(reverse_relationship_line)
# Combine all lines into the final query string
query += ",\n".join(station_lines + relationship_lines) + "\n"
return query
def my_create_connected_graph():
"""
Create a connected graph using stations and travel times data.
Ensures the 'station_graph' projection is dropped if it exists, then creates it with fresh data.
"""
# Step 1: Drop any existing 'station_graph' projection if it exists
drop_projection_query = """
CALL gds.graph.exists('station_graph') YIELD exists
WHERE exists
CALL gds.graph.drop('station_graph') YIELD graphName
RETURN graphName;
"""
session.run(drop_projection_query)
print("Checked and dropped any existing 'station_graph' projection if it existed.")
# Step 2: Wipe out the database
my_neo4j_wipe_out_database() # Clears the database completely
print("Database cleared successfully.")
# Step 3: Generate and run the CREATE query to add nodes and relationships
query = generate_neo4j_query_from_data(stations_df, travel_times_df)
session.run(query) # Run the query in a Neo4j session to add nodes and relationships
print("Nodes and relationships created successfully.")
# Step 4: Create a new 'station_graph' projection for GDS algorithms
create_projection_query = """
CALL gds.graph.project(
'station_graph',
'Station',
{
TRACK: {
type: 'TRACK',
orientation: 'UNDIRECTED',
properties: 'travel_time'
}
}
)
"""
session.run(create_projection_query)
print("Graph projection 'station_graph' created successfully.")
# Step 5: Verify that the projection was created successfully
result = session.run("CALL gds.graph.exists('station_graph') YIELD exists")
if result.single()['exists']:
print("Projection 'station_graph' verified to exist.")
else:
print("Error: Projection 'station_graph' was not created successfully.")
def my_neo4j_run_query_pandas(query, **kwargs):
"run a query and return the results in a pandas dataframe"
result = session.run(query, **kwargs)
df = pd.DataFrame([r.values() for r in result], columns=result.keys())
return df
def my_neo4j_nodes_relationships():
"print all the nodes and relationships"
print("-------------------------")
print(" Nodes:")
print("-------------------------")
query = """
match (n)
return n.name as node_name, labels(n) as labels
order by n.name
"""
df = my_neo4j_run_query_pandas(query)
number_nodes = df.shape[0]
display(df)
print("-------------------------")
print(" Relationships:")
print("-------------------------")
query = """
match (n1)-[r]->(n2)
return n1.name as node_name_1, labels(n1) as node_1_labels,
type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
order by node_name_1, node_name_2
"""
df = my_neo4j_run_query_pandas(query)
number_relationships = df.shape[0]
display(df)
density = (2 * number_relationships) / (number_nodes * (number_nodes - 1))
print("-------------------------")
print(" Density:", f'{density:.1f}')
print("-------------------------")
# PageRank query on the 'station_graph'
def run_page_rank():
"""
Run the PageRank algorithm on the 'station_graph' projection.
"""
page_rank_query = """
CALL gds.pageRank.stream('station_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS station, score
ORDER BY score DESC
"""
page_rank_df = my_neo4j_run_query_pandas(page_rank_query)
return page_rank_df
In [6]:
# Step 1: Create the graph structure
my_create_connected_graph()
# Parameters for PageRank
max_iterations = 20
damping_factor = 0.05
Checked and dropped any existing 'station_graph' projection if it existed. Database cleared successfully. Nodes and relationships created successfully. Graph projection 'station_graph' created successfully. Projection 'station_graph' verified to exist.
In [7]:
# Step 4: Run PageRank
pagerank_df = run_page_rank()
In [8]:
# Print all the nodes and relationships
my_neo4j_nodes_relationships()
------------------------- Nodes: -------------------------
node_name | labels | |
---|---|---|
0 | 12th Street | [Station] |
1 | 16th Street Mission | [Station] |
2 | 19th Street | [Station] |
3 | 24th Street Mission | [Station] |
4 | Antioch | [Station] |
5 | Ashby | [Station] |
6 | Balboa Park | [Station] |
7 | Bay Fair | [Station] |
8 | Berryessa | [Station] |
9 | Castro Valley | [Station] |
10 | Civic Center | [Station] |
11 | Coliseum | [Station] |
12 | Colma | [Station] |
13 | Concord | [Station] |
14 | Daly City | [Station] |
15 | Downtown Berkeley | [Station] |
16 | Dublin | [Station] |
17 | El Cerrito Plaza | [Station] |
18 | El Cerrito del Norte | [Station] |
19 | Embarcadero | [Station] |
20 | Fremont | [Station] |
21 | Fruitvale | [Station] |
22 | Glen Park | [Station] |
23 | Hayward | [Station] |
24 | Lafayette | [Station] |
25 | Lake Merritt | [Station] |
26 | MacArthur | [Station] |
27 | Millbrae | [Station] |
28 | Milpitas | [Station] |
29 | Montgomery Street | [Station] |
30 | North Berkeley | [Station] |
31 | North Concord | [Station] |
32 | OAK | [Station] |
33 | Orinda | [Station] |
34 | Pittsburg | [Station] |
35 | Pittsburg Center | [Station] |
36 | Pleasant Hill | [Station] |
37 | Powell Street | [Station] |
38 | Richmond | [Station] |
39 | Rockridge | [Station] |
40 | SFO | [Station] |
41 | San Bruno | [Station] |
42 | San Leandro | [Station] |
43 | South Hayward | [Station] |
44 | South San Francisco | [Station] |
45 | Union City | [Station] |
46 | Walnut Creek | [Station] |
47 | Warm Springs | [Station] |
48 | West Dublin | [Station] |
49 | West Oakland | [Station] |
------------------------- Relationships: -------------------------
node_name_1 | node_1_labels | relationship_type | node_name_2 | node_2_labels | |
---|---|---|---|---|---|
0 | 12th Street | [Station] | TRACK | 19th Street | [Station] |
1 | 12th Street | [Station] | TRACK | Lake Merritt | [Station] |
2 | 12th Street | [Station] | TRACK | West Oakland | [Station] |
3 | 16th Street Mission | [Station] | TRACK | 24th Street Mission | [Station] |
4 | 16th Street Mission | [Station] | TRACK | Civic Center | [Station] |
... | ... | ... | ... | ... | ... |
97 | West Dublin | [Station] | TRACK | Castro Valley | [Station] |
98 | West Dublin | [Station] | TRACK | Dublin | [Station] |
99 | West Oakland | [Station] | TRACK | 12th Street | [Station] |
100 | West Oakland | [Station] | TRACK | Embarcadero | [Station] |
101 | West Oakland | [Station] | TRACK | Lake Merritt | [Station] |
102 rows × 5 columns
------------------------- Density: 0.1 -------------------------
In [9]:
# Query to retrieve nodes and relationships using the correct relationship type
query = """
MATCH (s1:Station)-[r:TRACK]->(s2:Station)
RETURN s1.name AS station_1, s2.name AS station_2, r.travel_time AS travel_time
"""
results = session.run(query).data()
# Check if results are fetched correctly
if results:
# Create a NetworkX graph
G = nx.DiGraph() # Directed graph for routing
# Add nodes and edges from the Neo4j query results
for row in results:
G.add_edge(row['station_1'], row['station_2'], weight=row['travel_time'])
# Draw the graph
plt.figure(figsize=(16, 12))
pos = nx.spring_layout(G, seed=42) # Position nodes for better visualization
nx.draw(G, pos, with_labels=True, node_size=500, font_size=10, arrowsize=20)
nx.draw_networkx_edge_labels(G, pos, edge_labels={(row['station_1'], row['station_2']): row['travel_time'] for row in results})
plt.title("BART Station Connectivity Graph")
plt.show()
else:
print("No data found for the specified relationship type.")
Draw a Graph¶
In [10]:
# Step 1: Create a NetworkX graph and add nodes and edges from the Neo4j query results
G = nx.DiGraph() # Directed graph for routing
for row in results:
G.add_edge(row['station_1'], row['station_2'], weight=row['travel_time'])
# Step 2: Rank stations by connectivity using degree centrality
degree_centrality = nx.degree_centrality(G)
nx.set_node_attributes(G, degree_centrality, 'degree_centrality')
# Step 3: Identify and mark delivery hubs (top 10% most connected nodes)
hub_threshold = sorted(degree_centrality.values(), reverse=True)[int(len(degree_centrality) * 0.1)]
delivery_hubs = [node for node, centrality in degree_centrality.items() if centrality >= hub_threshold]
# Step 4: Identify multi-route connections (nodes with multiple neighbors)
multi_route_stations = [node for node, degree in G.degree if degree > 1]
# Step 5: Set up Plotly for an interactive visualization
pos = nx.spring_layout(G, seed=42) # Layout for the node positions
In [11]:
# Create Plotly nodes with color coding for special markers
node_trace = go.Scatter(
x=[pos[node][0] for node in G.nodes()],
y=[pos[node][1] for node in G.nodes()],
mode='markers+text',
marker=dict(
size=[20 if node in delivery_hubs else 10 for node in G.nodes()],
color=['red' if node in delivery_hubs else 'blue' for node in G.nodes()],
opacity=0.8,
),
text=[f"{node}<br>Centrality: {degree_centrality[node]:.2f}" for node in G.nodes()],
hoverinfo='text'
)
# Create Plotly edges
edge_trace = go.Scatter(
x=[],
y=[],
line=dict(width=0.5, color='#888'),
hoverinfo='none',
mode='lines'
)
for edge in G.edges():
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
edge_trace['x'] += (x0, x1, None)
edge_trace['y'] += (y0, y1, None)
In [12]:
# Set up Plotly layout
layout = go.Layout(
title='BART Station Connectivity Graph - Optimized for Delivery',
showlegend=False,
hovermode='closest',
margin=dict(b=0, l=0, r=0, t=40),
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)
# Combine nodes and edges
fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
In [13]:
# Step 6: Save as HTML and PNG
fig.write_html("bart_station_connectivity_graph.html") # Save as interactive HTML
fig.write_image("bart_station_connectivity_graph.png") # Save as PNG
# Display the interactive HTML in the notebook
display(IFrame("bart_station_connectivity_graph.html", width=900, height=600))
Drop Graph¶
In [14]:
# Step 5: Drop Graph Projection
query_drop_projection = "CALL gds.graph.drop('station_graph');"
my_neo4j_run_query_pandas(query_drop_projection)
# Close the driver after use
driver.close()
Received notification from DBMS server: {severity: WARNING} {code: Neo.ClientNotification.Statement.FeatureDeprecationWarning} {category: DEPRECATION} {title: This feature is deprecated and will be removed in future versions.} {description: The query used a deprecated field from a procedure. ('schema' returned by 'gds.graph.drop' is deprecated.)} {position: line: 1, column: 1, offset: 0} for query: "CALL gds.graph.drop('station_graph');"