import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
from scipy import stats

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and performance
# warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

# Apply seaborn's default theme to the figures created later on.
sb.set_theme()

# Set to False to reuse the copy of the data already saved in `data_file`.
DOWNLOAD_FRESH_DATA = True
data_file = "data.csv"

if DOWNLOAD_FRESH_DATA:
    from pathlib import Path

    import requests

    # url.txt holds the download URL; strip surrounding whitespace so a
    # trailing newline in that file never ends up inside the request URL.
    url = Path("url.txt").read_text(encoding="utf-8").strip()
    print("downloading fresh data")
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    # Write UTF-8 explicitly: pd.read_csv decodes as UTF-8 by default, whereas
    # write_text would otherwise fall back to the platform's locale encoding.
    Path(data_file).write_text(resp.text, encoding="utf-8")


# Load the saved CSV, then show the first rows and the summary statistics.
df = pd.read_csv(data_file)

for preview in (df.head(), df.describe()):
    display(preview)

downloading fresh data

# Parse the raw timestamp strings, then derive the helper columns used below:
#   morning - True when the event happened before noon
#   hour    - time of day as a fractional hour (e.g. 7.5 == 07:30:00)
#   day     - calendar date of the event
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
stamp = df["Timestamp"].dt
df["morning"] = stamp.hour < 12
df["hour"] = stamp.hour + stamp.minute / 60 + stamp.second / 3600
df["day"] = stamp.date
df.head()

# Trim stray leading/trailing whitespace from the event labels so that the
# "Enter"/"Exit" lookups further down match exactly.
df["Event"] = df["Event"].str.strip()

from itertools import product


# Summarise the time of day of each event type, separately for morning and
# afternoon rides. Grouping replaces a tuple `.loc` lookup on an unsorted
# MultiIndex, which indexes past the lexsort depth and triggers a pandas
# PerformanceWarning; get_group returns the same rows in their original order.
hour_by_group = df.groupby(["morning", "Event"])["hour"]
for index in product([True, False], ["Enter", "Exit"]):
    display(index)
    display(hour_by_group.get_group(index).describe())

(True, 'Enter')

count    110.000000
mean       7.176369
std        0.570877
min        5.168333
25%        6.940417
50%        7.150278
75%        7.448750
max        9.043056
Name: hour, dtype: float64

(True, 'Exit')

count    110.000000
mean       7.697856
std        0.594249
min        5.620556
25%        7.443611
50%        7.689444
75%        7.938403
max        9.507222
Name: hour, dtype: float64

(False, 'Enter')

count    105.000000
mean      17.010833
std        0.955967
min       13.498889
25%       16.477500
50%       17.064444
75%       17.496389
max       20.045278
Name: hour, dtype: float64

(False, 'Exit')

count    105.000000
mean      17.595646
std        0.930724
min       14.117500
25%       17.053056
50%       17.675278
75%       18.100833
max       20.488889
Name: hour, dtype: float64

# Pivot the event log so every (day, morning) pair becomes one trip row with
# its Enter and Exit values side by side. pivot_table aggregates with its
# default (mean) when a pair holds several events of the same type.
trip = df.pivot_table(columns="Event", index=["day", "morning"]).reset_index()

# Collapse the two-level column labels into single strings such as
# "hour_Enter"; the index columns have an empty second level and keep
# their plain name.
flat_names = []
for top, event in trip.columns.to_flat_index():
    flat_names.append(f"{top}_{event}" if event else top)
trip.columns = flat_names

# Derived columns: ride duration in minutes and a readable direction label.
trip["ridetime"] = 60.0 * (trip["hour_Exit"] - trip["hour_Enter"])
trip["Direction"] = trip["morning"].map({True: "To work", False: "From work"})

# Preview the result and keep a copy on disk.
display(trip.head())
trip.to_csv("trip.csv", index=False)

# Summary statistics of the ride duration: morning rides first, then the rest.
for is_morning in (True, False):
    display(trip.loc[trip["morning"] == is_morning, "ridetime"].describe())

count    110.000000
mean      31.289242
std        3.410525
min       24.566667
25%       29.470833
50%       30.758333
75%       32.458333
max       46.833333
Name: ridetime, dtype: float64

count    105.000000
mean      35.088730
std        4.049188
min       25.466667
25%       32.166667
50%       34.800000
75%       37.133333
max       45.216667
Name: ridetime, dtype: float64

# Histogram of all ride durations, both directions pooled.
ax = sb.histplot(x="ridetime", data=trip)
ax.set(xlabel="Ride length (min)")
ax.figure.tight_layout()

# Histogram of boarding times in half-hour bins, coloured by direction.
ax = sb.histplot(x="hour_Enter", hue="Direction", binwidth=0.5, data=trip)
ax.set(xlabel="Enter time, 30 min bins")
ax.figure.tight_layout()

# Box plot of ride durations, one box per direction.
ax = sb.boxplot(x="ridetime", hue="Direction", data=trip)
fig = ax.figure
fig.set_size_inches(10, 4)
ax.set(xlabel="Ride length (min)")
fig.tight_layout()

# Ride duration over time, one line per direction.
ax = sb.lineplot(x="Timestamp_Enter", y="ridetime", hue="Direction", data=trip)
fig = ax.figure
fig.set_size_inches(14, 4)
ax.set(xlabel="Date", ylabel="Ride length (min)")
fig.tight_layout()

# Welch's two-sample t-test of H0 "mean to-work and from-work ride times are
# equal" against the two-sided alternative.
alpha = 0.01  # significance level, i.e. 99% confidence

ride_to_work = trip[trip.morning == True].ridetime
ride_from_work = trip[trip.morning == False].ridetime
if ride_to_work.hasnans or ride_from_work.hasnans:
    print("NaNs found in ridetimes")
    # ttest_ind propagates NaN by default: the statistic and p-value would
    # both become NaN and the comparison below would silently fall through
    # to "do not reject". Drop the incomplete trips instead.
    ride_to_work = ride_to_work.dropna()
    ride_from_work = ride_from_work.dropna()

# Size of the smaller sample, reported for reference only. Welch's test does
# not need equally sized samples, so every observation is used below rather
# than truncating the larger sample.
n = min(len(ride_to_work), len(ride_from_work))
print(f"{n=}")

result = stats.ttest_ind(
    ride_to_work.to_numpy(), ride_from_work.to_numpy(), equal_var=False
)
print(result)
# Report the interval at the same confidence level the decision uses; the
# default of confidence_interval() is 95%.
print(result.confidence_interval(confidence_level=1 - alpha))

if result.pvalue < alpha:
    print("Reject H0 in favor of HA that average to-work and from-work ride times are not equal.")
else:
    print("Do not reject H0 that average to-work and from-work ride times are equal.")

n=105
TtestResult(statistic=np.float64(-7.222168120398091), pvalue=np.float64(1.0106150683807867e-11), df=np.float64(202.87965032649853))
ConfidenceInterval(low=np.float64(-4.772778079956935), high=np.float64(-2.7256346184557936))
Reject H0 in favor of HA that average to-work and from-work ride times are not equal.

	day	morning	Timestamp_Enter	Timestamp_Exit	hour_Enter	hour_Exit	ridetime	Direction
0	2024-07-16	False	2024-07-16 19:33:00	2024-07-16 20:05:00	19.550000	20.083333	32.000000	From work
1	2024-07-16	True	2024-07-16 08:01:00	2024-07-16 08:33:00	8.016667	8.550000	32.000000	To work
2	2024-07-17	False	2024-07-17 17:13:29	2024-07-17 17:49:28	17.224722	17.824444	35.983333	From work
3	2024-07-18	False	2024-07-18 17:30:37	2024-07-18 18:06:03	17.510278	18.100833	35.433333	From work
4	2024-07-18	True	2024-07-18 07:25:46	2024-07-18 07:56:16	7.429444	7.937778	30.500000	To work

Bus ride times

Get data

Destructure time

Transform

Plots

Distribution of all ride times

Distribution of ride times by direction

Box plot of ride times by direction

Time series of ride times by direction

T-Test of mean directional ride times

Conclusion

	Timestamp	Event
0	7/16/2024 8:01:00	Enter
1	7/16/2024 8:33:00	Exit
2	7/16/2024 19:33:00	Enter
3	7/16/2024 20:05:00	Exit
4	7/17/2024 17:13:29	Enter

	Timestamp	Event
count	430	430
unique	430	2
top	2/7/2025 18:35:46	Enter
freq	1	215