Imports¶
In [ ]:
import requests, io, zipfile, random
import geopandas as gpd
import pytz
import numpy as np
from datetime import datetime
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor, plot_importance, plot_tree as xg_plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
Data Acquisition & Preprocessing¶
In [ ]:
# Download the timezone-boundary-builder 2025b archive ("with oceans", "now"
# variant) and unpack it into the current working directory.
req = requests.get(
    "https://github.com/evansiroky/timezone-boundary-builder/releases/download/2025b/timezones-with-oceans-now.geojson.zip",
    headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36"},
    # requests has no default timeout; without one a stalled connection
    # blocks the notebook forever. (connect seconds, read seconds)
    timeout=(10, 120),
)
if req.status_code == 200:
    # The context manager releases the archive handle once extraction is done.
    with zipfile.ZipFile(io.BytesIO(req.content)) as zip_file:
        zip_file.extractall()
else:
    print(f"Failed to download file. Status code: {req.status_code}")
In [ ]:
# Read the boundary file into a GeoDataFrame (presumably the file unpacked by
# the download cell -- confirm the archive member name) and show its shape.
boundaries_path = "combined-with-oceans-now.json"
gdf = gpd.read_file(boundaries_path)
gdf.shape
Out[ ]:
In [ ]:
# Centroids are computed on a copy reprojected to EPSG:3395 and then converted
# back to EPSG:4326 so their x/y can be stored as longitude/latitude.
# NOTE(review): centroids of polygons that touch the poles or straddle the
# antimeridian may be unreliable in this projection -- confirm on the data.
gdf_projected = gdf.to_crs(epsg=3395)
gdf_projected["centroid"] = gdf_projected.geometry.centroid

centroids_wgs84 = gdf_projected["centroid"].to_crs(epsg=4326)
gdf["centroid"] = centroids_wgs84
gdf["longitude"] = centroids_wgs84.x
gdf["latitude"] = centroids_wgs84.y
def get_utc_offset(tzid):
    """Return the UTC offset, in hours, currently in effect for ``tzid``.

    ``Etc/GMT<sign><N>`` identifiers are parsed directly: the numeric suffix
    is negated (``"Etc/GMT+5"`` -> ``-5.0``) and a bare ``"Etc/GMT"`` yields
    ``0.0``. Every other identifier is resolved through pytz and evaluated at
    the current instant.

    Args:
        tzid: Time zone identifier string, e.g. ``"Europe/Paris"``.

    Returns:
        The offset in hours as a float, or ``None`` when pytz does not know
        the identifier or an ``Etc/GMT`` suffix is not numeric.
    """
    try:
        if tzid.startswith("Etc/GMT"):
            suffix = tzid.split("Etc/GMT")[1]
            if not suffix:
                # Bare "Etc/GMT" is the zero-offset zone; float("") would
                # raise and the zone would be reported as unknown.
                return 0.0
            offset = float(suffix)
            # The suffix sign is inverted relative to the UTC offset.
            # Guard the zero case so -0.0 is never returned.
            return -offset if offset else 0.0
        tz = pytz.timezone(tzid)
        # Convert an aware UTC instant rather than handing pytz a naive local
        # time: a naive value is read as wall time in the *target* zone, and
        # pytz raises for wall times inside a DST gap or overlap.
        offset_timedelta = datetime.now(pytz.utc).astimezone(tz).utcoffset()
        return offset_timedelta.total_seconds() / 3600
    except (pytz.UnknownTimeZoneError, IndexError, ValueError):
        return None
# Actual offset per zone; rows for which get_utc_offset returned None end up
# as missing values and are removed.
gdf["actual_tz_offset(h)"] = gdf["tzid"].apply(get_utc_offset)
gdf.dropna(subset=["actual_tz_offset(h)"], inplace=True)

# Pin Etc/UTC to exactly zero.
utc_rows = gdf["tzid"] == "Etc/UTC"
gdf.loc[utc_rows, "actual_tz_offset(h)"] = 0

# "Ideal" offset: centroid longitude divided into 15-degree bands, rounded to
# a whole number of hours. The deviation is actual minus ideal.
ideal_offset = (gdf["longitude"] / 15).round()
gdf["ideal_tz_offset(h)"] = ideal_offset
gdf["offset_deviation(h)"] = gdf["actual_tz_offset(h)"] - ideal_offset
In [ ]:
# Rank zones by deviation (largest first) and display only the rows whose
# deviation is non-zero.
print("Time zones with the largest deviations from the ideal:")
largest_deviations = gdf.sort_values(by="offset_deviation(h)", ascending=False)
has_deviation = largest_deviations["offset_deviation(h)"] != 0
largest_deviations.loc[has_deviation, ["tzid", "offset_deviation(h)"]]
Out[ ]:
Training¶
In [ ]:
# Zones whose actual offset matches the ideal one (within 0.1 h) form the
# training set; every deviating zone goes to the test set.
matches_ideal = np.isclose(gdf["offset_deviation(h)"], 0, atol=0.1)
train_data = gdf.loc[matches_ideal].copy()
test_data = gdf.loc[~matches_ideal].copy()

# Features are the centroid coordinates; the target is the ideal offset.
feature_columns = ["longitude", "latitude"]
X_train = train_data[feature_columns]
y_train = train_data["ideal_tz_offset(h)"]
X_test = test_data[feature_columns]
In [ ]:
# Distance-weighted 5-nearest-neighbour regressor fitted on the training
# zones; its predictions for the test zones are stored as a new column.
knn = KNeighborsRegressor(weights="distance", n_neighbors=5)
knn.fit(X_train, y_train)

predicted_ideal_tz_knn = knn.predict(X_test)
test_data["knn_predicted_ideal_tz_offset"] = predicted_ideal_tz_knn
In [ ]:
# Random forest of 32 depth-4 trees with a fixed seed; its predictions for
# the test zones are stored as a new column.
rf_params = {
    "n_estimators": 32,
    "max_depth": 4,
    "bootstrap": True,
    "random_state": 25,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

predicted_ideal_tz_rf = rf.predict(X_test)
test_data["predicted_rf_tz_offset"] = predicted_ideal_tz_rf
In [ ]:
# Gradient-boosted regressor (128 rounds, depth 5, fixed seed) trained with
# the "reg:absoluteerror" objective; its predictions for the test zones are
# stored as a new column.
xgb_params = {
    "max_depth": 5,
    "learning_rate": 0.04,
    "n_estimators": 128,
    "subsample": 0.8,
    "reg_alpha": 0.5,
    "reg_lambda": 1.0,
    "random_state": 25,
    "objective": "reg:absoluteerror",
}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

predicted_ideal_tz_xgb = xgb.predict(X_test)
test_data["predicted_xgb_tz_offset"] = predicted_ideal_tz_xgb
Testing¶
In [ ]:
# Score each model's test-set predictions against the ideal offsets and print
# one summary line per model.
y_true = test_data["ideal_tz_offset(h)"].to_numpy()

model_predictions = {
    "KNN": predicted_ideal_tz_knn,
    "RF": predicted_ideal_tz_rf,
    "XGB": predicted_ideal_tz_xgb,
}

for name, preds in model_predictions.items():
    r2 = r2_score(y_true, preds)
    rmse = root_mean_squared_error(y_true, preds)
    mse = mean_squared_error(y_true, preds)
    mae = mean_absolute_error(y_true, preds)
    print(f"{name} — MAE: {mae:.4f}, RMSE: {rmse:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}")
Results¶
In [ ]:
# Show actual, ideal and model-predicted offsets side by side for the
# deviating zones, largest deviation first.
print("--- Model Predictions for Deviating Time Zones ---")
report_columns = [
    "tzid",
    "actual_tz_offset(h)",
    "ideal_tz_offset(h)",
    "offset_deviation(h)",
    "knn_predicted_ideal_tz_offset",
    "predicted_rf_tz_offset",
    "predicted_xgb_tz_offset",
]
test_data[report_columns].sort_values(by="offset_deviation(h)", ascending=False)
Out[ ]:
In [ ]:
# Summarise how far the actual offsets sit from each reference:
#   - "Calculated": actual minus the longitude-derived ideal offset
#   - model rows:   actual minus that model's predicted offset
# The model columns hold predicted offsets, not deviations, so they are
# subtracted from the actual offset here; that puts every row of the report
# on the same "delta" scale as "offset_deviation(h)".
print("--- Analyzed deltas ---\n")

actual_offset = test_data["actual_tz_offset(h)"]
deltas = {
    "Calculated": test_data["offset_deviation(h)"],
    "KNN": actual_offset - test_data["knn_predicted_ideal_tz_offset"],
    "RF": actual_offset - test_data["predicted_rf_tz_offset"],
    "XGBoost": actual_offset - test_data["predicted_xgb_tz_offset"],
}

for name, delta in deltas.items():
    mean_val = np.mean(delta)
    median_val = np.median(delta)
    print(f"{name} - Mean: {mean_val:.4f}, Median: {median_val:.4f}", end="\n\n")
Model Visualizations¶
In [ ]:
# XGBoost feature importance. plot_importance opens its own figure when no
# axes are supplied, which would leave a separately created figure blank and
# its figsize unused, so the axes are created here and passed in explicitly.
fig, ax = plt.subplots(figsize=(8, 5))
plot_importance(xgb, importance_type="weight", ax=ax)
plt.show()

# Random forest feature importance as a bar chart on its own axes.
fig, ax = plt.subplots()
ax.bar(X_train.columns, rf.feature_importances_, color="skyblue")
ax.set_ylabel("Feature importance")
ax.set_title("Random forest feature importance")
plt.show()
In [ ]:
# Draw the first 32 trees of the random forest on an 8x4 grid, one per panel.
random.seed(25)
n_trees = 32

fig, axes = plt.subplots(nrows=8, ncols=4, figsize=(20, 40))
axes = axes.ravel()

for i, panel in enumerate(axes[:n_trees]):
    plot_tree(
        rf.estimators_[i],
        ax=panel,
        feature_names=X_train.columns,
        fontsize=10,
        filled=True,
        rounded=True,
    )
    panel.set_title(f'Tree {i}')

plt.tight_layout()
plt.show()
In [ ]:
# Draw 32 consecutive XGBoost trees, starting at index 128 - 32 = 96, on an
# 8x4 grid with top-to-bottom layout.
n_trees = 32
start_idx = 128 - n_trees

fig, axes = plt.subplots(nrows=8, ncols=4, figsize=(20, 40))
axes = axes.ravel()

for tree_idx, panel in enumerate(axes[:n_trees], start=start_idx):
    xg_plot_tree(xgb, tree_idx=tree_idx, rankdir="TB", ax=panel)
    panel.set_title(f'Tree {tree_idx}')

plt.tight_layout()
plt.show()