diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml deleted file mode 100644 index bffa0b5..0000000 --- a/.github/workflows/publish-to-pypi.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Publish to PyPI - -on: - push: - branches: - - main - -jobs: - pypi-publish: - name: upload release to PyPI - runs-on: ubuntu-latest - # Specifying a GitHub environment is optional, but strongly encouraged - # environment: release - permissions: - # IMPORTANT: this permission is mandatory for trusted publishing - id-token: write - steps: - # retrieve your distributions here - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Publish package distributions to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..e80bd9a --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,40 @@ +name: Publish Python Package + +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + + - name: Build package + run: | + python setup.py sdist bdist_wheel + + - name: Publish package + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + twine upload dist/* \ No newline at end of file diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml deleted file mode 100644 index bdaab28..0000000 --- a/.github/workflows/python-publish.yml +++ /dev/null @@ -1,39 +0,0 @@ -# This workflow will upload a Python Package using Twine when a release is created -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. 
- -name: Upload Python Package - -on: - release: - types: [published] - -permissions: - contents: read - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: python -m build - - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/gtfs_functions/__init__.py b/gtfs_functions/__init__.py index 1fbf778..1ac337a 100644 --- a/gtfs_functions/__init__.py +++ b/gtfs_functions/__init__.py @@ -1,2 +1,3 @@ from gtfs_functions.gtfs_functions import Feed -# from gtfs_functions.gtfs_plots import map_gdf \ No newline at end of file + +# from gtfs_functions.gtfs_plots import map_gdf diff --git a/gtfs_functions/aux_functions.py b/gtfs_functions/aux_functions.py index 7aaa80d..6db210c 100644 --- a/gtfs_functions/aux_functions.py +++ b/gtfs_functions/aux_functions.py @@ -8,100 +8,136 @@ def add_runtime(st): # Get the runtime between stops - logging.info('adding runtime') - st.sort_values(by=['trip_id', 'stop_sequence'], inplace=True, ascending=True) + logging.info("adding runtime") + st.sort_values(by=["trip_id", "stop_sequence"], inplace=True, ascending=True) c = st.trip_id == st.trip_id.shift(-1) - st.loc[c, 'runtime_sec'] = st.arrival_time.shift(-1)[c] - st.arrival_time[c] - st['end_stop_id'] = st.stop_id.shift(-1) + st.loc[c, "runtime_sec"] = st.arrival_time.shift(-1)[c] - st.arrival_time[c] + st["end_stop_id"] = st.stop_id.shift(-1) return st def add_distance( - stop_times, segments_gdf, - seg_cols=[ - 'shape_id', 'route_id', 'direction_id', 'stop_sequence', - 'segment_id', 'segment_name', - 'start_stop_id', 'end_stop_id', 'start_stop_name', - 'end_stop_name','distance_m', 'geometry'], - st_cols=[ - 'shape_id', 'route_id', 'route_name', 'direction_id', - 'stop_sequence', 'stop_id', 'end_stop_id', 'runtime_sec', - 'arrival_time', 'departure_time']): - logging.info('adding distance in meters') + stop_times, + segments_gdf, + seg_cols=[ + "shape_id", + "route_id", + "direction_id", + "stop_sequence", + "segment_id", + "segment_name", + "start_stop_id", + "end_stop_id", + "start_stop_name", + "end_stop_name", + "distance_m", + "geometry", + ], + st_cols=[ + "shape_id", + "route_id", + "route_name", + "direction_id", + "stop_sequence", + "stop_id", + "end_stop_id", + "runtime_sec", + "arrival_time", + "departure_time", + ], +): + logging.info("adding distance in meters") st = stop_times[st_cols] - st.rename(columns={'stop_id': 'start_stop_id'}, inplace=True) + st.rename(columns={"stop_id": "start_stop_id"}, inplace=True) # Merge with segments_gdf to get the distance - dist = pd.merge(st, segments_gdf[seg_cols], how='left') - dist = gpd.GeoDataFrame(data=dist, geometry=dist.geometry, crs='EPSG:4326') - + dist = pd.merge(st, segments_gdf[seg_cols], how="left") + dist = gpd.GeoDataFrame(data=dist, geometry=dist.geometry, crs="EPSG:4326") + return dist def add_speed(speeds): # Calculate the speed for runtimes != 0 - logging.info('calculating speed in km/h') + logging.info("calculating speed in km/h") c = speeds.runtime_sec != 0 - speeds.loc[c, 'speed_kmh'] = round( - speeds[c].distance_m / speeds[c].runtime_sec * 3.6) + speeds.loc[c, "speed_kmh"] = round(speeds[c].distance_m / speeds[c].runtime_sec * 3.6) # Assign average speed to those with 
runtimes==0 - speeds.loc[~c, 'speed_kmh'] = speeds[c].speed_kmh.mean() + speeds.loc[~c, "speed_kmh"] = speeds[c].speed_kmh.mean() # Remove null values speeds = speeds.loc[~speeds.speed_kmh.isnull()] - + return speeds def fix_outliers(speeds): # Calculate average speed to modify outliers - logging.info('fixing outliers') + logging.info("fixing outliers") avg_speed_route = speeds.pivot_table( - 'speed_kmh', - index=['route_id', 'direction_id', 'window'], - aggfunc='mean').reset_index() + "speed_kmh", index=["route_id", "direction_id", "window"], aggfunc="mean" + ).reset_index() - avg_speed_route.rename(columns={'speed_kmh': 'avg_route_speed_kmh'}, inplace=True) + avg_speed_route.rename(columns={"speed_kmh": "avg_route_speed_kmh"}, inplace=True) # Assign average speed to outliers - speeds = pd.merge(speeds, avg_speed_route, how='left') + speeds = pd.merge(speeds, avg_speed_route, how="left") out_c = speeds.speed_kmh > 120 - speeds.loc[out_c, 'speed_kmh'] = speeds.loc[out_c, 'avg_route_speed_kmh'] + speeds.loc[out_c, "speed_kmh"] = speeds.loc[out_c, "avg_route_speed_kmh"] # Get the columns in the right format - speeds['avg_route_speed_kmh'] = round(speeds.avg_route_speed_kmh, 1) + speeds["avg_route_speed_kmh"] = round(speeds.avg_route_speed_kmh, 1) return speeds def aggregate_speed(speeds, segments_gdf): # Get the average per route, direction, segment and time of day - logging.info('aggregating speed by segment and window') + logging.info("aggregating speed by segment and window") speeds_agg = speeds.pivot_table( - ['speed_kmh', 'runtime_sec', 'avg_route_speed_kmh'], - index=['route_name', 'direction_id', 'segment_id','window'], - aggfunc='mean').reset_index() + ["speed_kmh", "runtime_sec", "avg_route_speed_kmh"], + index=["route_name", "direction_id", "segment_id", "window"], + aggfunc="mean", + ).reset_index() # Format the merge columns correctly - speeds_agg['direction_id'] = speeds_agg.direction_id.astype(int) - segments_gdf['direction_id'] = segments_gdf.direction_id.astype(int) - + speeds_agg["direction_id"] = speeds_agg.direction_id.astype(int) + segments_gdf["direction_id"] = segments_gdf.direction_id.astype(int) # Add geometries to segments - data = pd.merge( - speeds_agg, segments_gdf, - left_on=['route_name', 'direction_id', 'segment_id'], - right_on=['route_name', 'direction_id', 'segment_id'], - how='left').reset_index(drop=True).sort_values( - by=['route_id', 'direction_id', 'window', 'stop_sequence'], - ascending=True) - - ordered_cols = ['route_id', 'route_name', 'direction_id', 'segment_id', 'window', - 'speed_kmh', 'avg_route_speed_kmh','stop_sequence', 'segment_name', - 'start_stop_name', 'end_stop_name', 'start_stop_id', 'end_stop_id', 'shape_id', - 'runtime_sec', 'distance_m', 'geometry'] + data = ( + pd.merge( + speeds_agg, + segments_gdf, + left_on=["route_name", "direction_id", "segment_id"], + right_on=["route_name", "direction_id", "segment_id"], + how="left", + ) + .reset_index(drop=True) + .sort_values(by=["route_id", "direction_id", "window", "stop_sequence"], ascending=True) + ) + + ordered_cols = [ + "route_id", + "route_name", + "direction_id", + "segment_id", + "window", + "speed_kmh", + "avg_route_speed_kmh", + "stop_sequence", + "segment_name", + "start_stop_name", + "end_stop_name", + "start_stop_id", + "end_stop_id", + "shape_id", + "runtime_sec", + "distance_m", + "geometry", + ] return data[ordered_cols] @@ -110,20 +146,26 @@ def get_all_lines_speed(speeds, segments_gdf): # Get the average per segment and time of day # Then add it to the rest of the data 
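# (Illustration only, not part of the patch.) A minimal sketch of the pivot_table
# aggregation pattern that get_all_lines_speed applies below: mean speed and runtime per
# segment_id and window, computed on a made-up frame. The real function then merges the
# segment geometries from segments_gdf back onto this aggregate.
import pandas as pd

speeds = pd.DataFrame({
    "segment_id": ["s1 - s2", "s1 - s2", "s2 - s3"],
    "window": ["6:00-9:00", "6:00-9:00", "6:00-9:00"],
    "speed_kmh": [18.0, 22.0, 30.0],
    "runtime_sec": [120, 100, 90],
})
all_lines = speeds.pivot_table(
    ["speed_kmh", "runtime_sec"], index=["segment_id", "window"], aggfunc="mean"
).reset_index()
print(all_lines)  # one row per segment/window with the averaged values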
all_lines = speeds.pivot_table( - ['speed_kmh', 'runtime_sec', 'avg_route_speed_kmh'], - index=['segment_id', 'window'], - aggfunc='mean').reset_index() - - data_all_lines = pd.merge( - all_lines, - segments_gdf.drop_duplicates(subset=['segment_id']), - left_on=['segment_id'], right_on=['segment_id'], - how='left').reset_index(drop=True).sort_values( - by=['direction_id', 'window', 'stop_sequence'], ascending=True) + ["speed_kmh", "runtime_sec", "avg_route_speed_kmh"], + index=["segment_id", "window"], + aggfunc="mean", + ).reset_index() + + data_all_lines = ( + pd.merge( + all_lines, + segments_gdf.drop_duplicates(subset=["segment_id"]), + left_on=["segment_id"], + right_on=["segment_id"], + how="left", + ) + .reset_index(drop=True) + .sort_values(by=["direction_id", "window", "stop_sequence"], ascending=True) + ) - data_all_lines['route_id'] = 'ALL_LINES' - data_all_lines['route_name'] = 'All lines' - data_all_lines['direction_id'] = 'NA' + data_all_lines["route_id"] = "ALL_LINES" + data_all_lines["route_name"] = "All lines" + data_all_lines["direction_id"] = "NA" return data_all_lines @@ -136,81 +178,103 @@ def add_all_lines_speed(data, speeds, segments_gdf): data_complete = pd.concat([data, data_all_lines]) # Clean data - data_complete = data_complete[ - ~data_complete.route_name.isnull()].reset_index(drop=True) + data_complete = data_complete[~data_complete.route_name.isnull()].reset_index(drop=True) # Get the columns in the right format - data_complete['speed_kmh'] = round(data_complete.speed_kmh, 1) + data_complete["speed_kmh"] = round(data_complete.speed_kmh, 1) cols = [ - 'route_id', 'route_name', 'direction_id', 'segment_name', 'window', - 'speed_kmh', - 'segment_id', - 'start_stop_id', 'start_stop_name', 'end_stop_id', 'end_stop_name', - 'distance_m', 'stop_sequence', 'shape_id', 'runtime_sec', 'geometry'] + "route_id", + "route_name", + "direction_id", + "segment_name", + "window", + "speed_kmh", + "segment_id", + "start_stop_id", + "start_stop_name", + "end_stop_id", + "end_stop_name", + "distance_m", + "stop_sequence", + "shape_id", + "runtime_sec", + "geometry", + ] return data_complete def add_free_flow(speeds, data_complete): # Calculate max speed per segment to have a free_flow reference - max_speed_segment = speeds.pivot_table( - 'speed_kmh', - index='segment_name', - aggfunc='max') + max_speed_segment = speeds.pivot_table("speed_kmh", index="segment_name", aggfunc="max") - max_speed_segment.rename(columns={'speed_kmh': 'segment_max_speed_kmh'}, inplace=True) + max_speed_segment.rename(columns={"speed_kmh": "segment_max_speed_kmh"}, inplace=True) # Assign max speeds to each segment data_complete = pd.merge( - data_complete, max_speed_segment, - left_on=['segment_name'], + data_complete, + max_speed_segment, + left_on=["segment_name"], right_index=True, - how='left') - + how="left", + ) + order_cols = [ - 'route_name', 'direction_id', 'window', 'segment_name', 'stop_sequence', - 'speed_kmh', 'avg_route_speed_kmh', 'segment_max_speed_kmh', 'route_id', 'segment_id', - 'start_stop_name', 'end_stop_name', 'start_stop_id', 'end_stop_id', - 'shape_id', 'runtime_sec', 'distance_m', 'geometry' + "route_name", + "direction_id", + "window", + "segment_name", + "stop_sequence", + "speed_kmh", + "avg_route_speed_kmh", + "segment_max_speed_kmh", + "route_id", + "segment_id", + "start_stop_name", + "end_stop_name", + "start_stop_id", + "end_stop_id", + "shape_id", + "runtime_sec", + "distance_m", + "geometry", ] return data_complete -def add_all_lines( - line_frequencies, - 
segments_gdf, - labels, - cutoffs): - - logging.info('adding data for all lines.') - +def add_all_lines(line_frequencies, segments_gdf, labels, cutoffs): + + logging.info("adding data for all lines.") + # Calculate sum of trips per segment with all lines - all_lines = line_frequencies.pivot_table( - ['ntrips'], - index=['segment_id', 'window'], - aggfunc='sum').reset_index() + all_lines = line_frequencies.pivot_table(["ntrips"], index=["segment_id", "window"], aggfunc="sum").reset_index() - sort_these = ['direction_id', 'window', 'stop_sequence'] + sort_these = ["direction_id", "window", "stop_sequence"] - data_all_lines = pd.merge( - all_lines, - segments_gdf.drop_duplicates(subset=['segment_id']), - left_on=['segment_id'], right_on=['segment_id'], - how='left').reset_index().sort_values(by=sort_these, ascending=True) + data_all_lines = ( + pd.merge( + all_lines, + segments_gdf.drop_duplicates(subset=["segment_id"]), + left_on=["segment_id"], + right_on=["segment_id"], + how="left", + ) + .reset_index() + .sort_values(by=sort_these, ascending=True) + ) - data_all_lines.drop(['index'], axis=1, inplace=True) - data_all_lines['route_id'] = 'ALL_LINES' - data_all_lines['route_name'] = 'All lines' - data_all_lines['direction_id'] = 'NA' + data_all_lines.drop(["index"], axis=1, inplace=True) + data_all_lines["route_id"] = "ALL_LINES" + data_all_lines["route_name"] = "All lines" + data_all_lines["direction_id"] = "NA" # Add frequency for all lines start_time = data_all_lines.window.apply(lambda x: cutoffs[labels.index(x)]) end_time = data_all_lines.window.apply(lambda x: cutoffs[labels.index(x) + 1]) - data_all_lines['min_per_trip'] = ((end_time - start_time)*60 / data_all_lines.ntrips)\ - .astype(int) + data_all_lines["min_per_trip"] = ((end_time - start_time) * 60 / data_all_lines.ntrips).astype(int) # Append data for all lines to the input df data_complete = pd.concat([line_frequencies, data_all_lines]).reset_index(drop=True) @@ -226,8 +290,8 @@ def fix_departure_time(times_to_fix): - times_to_fix: np.array of integers with seconds past from midnight. """ - next_day = times_to_fix >= 24*3600 - times_to_fix[next_day] = times_to_fix[next_day] - 24 * 3600 + next_day = times_to_fix >= 24 * 3600 + times_to_fix[next_day] = times_to_fix[next_day] - 24 * 3600 return times_to_fix @@ -240,39 +304,39 @@ def label_creation(cutoffs): Output: - labels: list of strings. 
- Example: + Example: label_creation(cutoffs=[0, 10, 15.5, 25]) --> [0:00, 10:00, 15:30, 25:00] """ labels = [] if max(cutoffs) <= 24: for w in cutoffs: if float(w).is_integer(): - label = str(w) + ':00' + label = str(w) + ":00" else: n = math.modf(w) - label = str(int(n[1])) + ':' + str(int(n[0]*60)) + label = str(int(n[1])) + ":" + str(int(n[0] * 60)) labels.append(label) else: labels = [] for w in cutoffs: if float(w).is_integer(): if w > 24: - w1 = w-24 - label = str(w1) + ':00' + w1 = w - 24 + label = str(w1) + ":00" else: - label = str(w) + ':00' + label = str(w) + ":00" labels.append(label) else: if w > 24: - w1 = w-24 + w1 = w - 24 n = math.modf(w1) - label = str(int(n[1])) + ':' + str(int(n[0]*60)) + label = str(int(n[1])) + ":" + str(int(n[0] * 60)) else: n = math.modf(w) - label = str(int(n[1])) + ':' + str(int(n[0]*60)) + label = str(int(n[1])) + ":" + str(int(n[0] * 60)) labels.append(label) - labels = [labels[i] + '-' + labels[i+1] for i in range(0, len(labels)-1)] + labels = [labels[i] + "-" + labels[i + 1] for i in range(0, len(labels) - 1)] return labels @@ -283,9 +347,9 @@ def window_creation(stop_times, cutoffs): # If the cutoffs are withing 0 and 24 hours, let's make sure # the times of the GTFS fit this time period if max(cutoffs) <= 24: - stop_times['departure_time'] = fix_departure_time(stop_times.departure_time.values) - stop_times['arrival_time'] = fix_departure_time(stop_times.arrival_time.values) - + stop_times["departure_time"] = fix_departure_time(stop_times.departure_time.values) + stop_times["arrival_time"] = fix_departure_time(stop_times.arrival_time.values) + # Create the labels for the cutoffs labels = label_creation(cutoffs) @@ -293,102 +357,102 @@ def window_creation(stop_times, cutoffs): departure_time = stop_times.departure_time / 3600 # Put each trip in the right window - stop_times['window'] = pd.cut( - departure_time, bins=cutoffs, right=False, labels=labels) + stop_times["window"] = pd.cut(departure_time, bins=cutoffs, right=False, labels=labels) stop_times = stop_times.loc[~stop_times.window.isnull()] - stop_times['window'] = stop_times.window.astype(str) + stop_times["window"] = stop_times.window.astype(str) return stop_times def seconds_since_midnight(times_string): """ - Transforms a series of time strings of the form "10:00:10" + Transforms a series of time strings of the form "10:00:10" to an integer that represents the seconds since midnight. """ - vals = times_string.split(':') + vals = times_string.split(":") seconds = 0 for p, v in enumerate(vals): - seconds += int(v) * (3600/(60**p)) + seconds += int(v) * (3600 / (60**p)) return seconds def add_frequency( - stop_times, labels, index_='stop_id', col='window', - cutoffs=[0, 6, 9, 15, 19, 22, 24]): - + stop_times, + labels, + index_="stop_id", + col="window", + cutoffs=[0, 6, 9, 15, 19, 22, 24], +): + if isinstance(index_, list): - index_list = index_ + ['direction_id', col] + index_list = index_ + ["direction_id", col] elif isinstance(index_, str): - index_list = [index_, 'direction_id', col] + index_list = [index_, "direction_id", col] # Some gtfs feeds only contain direction_id 0, use that as default - trips_agg = stop_times.pivot_table( - 'trip_id', index=index_list, - aggfunc='count').reset_index() + trips_agg = stop_times.pivot_table("trip_id", index=index_list, aggfunc="count").reset_index() # direction_id is optional, as it is not needed to determine trip frequencies # However, if direction_id is NaN, pivot_table will return an empty DataFrame. 
# Therefore, use a sensible default if direction id is not known. # Some gtfs feeds only contain direction_id 0, use that as default - trips_agg.rename(columns={'trip_id': 'ntrips'}, inplace=True) + trips_agg.rename(columns={"trip_id": "ntrips"}, inplace=True) start_time = trips_agg.window.apply(lambda x: cutoffs[labels.index(x)]) end_time = trips_agg.window.apply(lambda x: cutoffs[labels.index(x) + 1]) - trips_agg['min_per_trip'] = ((end_time - start_time)*60 / trips_agg.ntrips)\ - .astype(int) + trips_agg["min_per_trip"] = ((end_time - start_time) * 60 / trips_agg.ntrips).astype(int) return trips_agg def add_route_name(data, routes): # Add the route name - routes['route_name'] = '' + routes["route_name"] = "" def check_null(col): # Check for null values check = ( - routes[col].isnull().unique()[0] | - (routes[col] == np.nan).unique()[0] | - (routes[col] == 'nan').unique()[0] + routes[col].isnull().unique()[0] | (routes[col] == np.nan).unique()[0] | (routes[col] == "nan").unique()[0] ) return check - if check_null('route_short_name'): - routes['route_name'] = routes.route_long_name - elif check_null('route_long_name'): - routes['route_name'] = routes.route_short_name + if check_null("route_short_name"): + routes["route_name"] = routes.route_long_name + elif check_null("route_long_name"): + routes["route_name"] = routes.route_short_name else: - routes['route_name'] =\ - routes.route_short_name.astype(str)\ - + ' ' + routes.route_long_name.astype(str) + routes["route_name"] = routes.route_short_name.astype(str) + " " + routes.route_long_name.astype(str) data = pd.merge( - data, routes[['route_id', 'route_name']], - left_on='route_id', right_on='route_id', how='left') + data, + routes[["route_id", "route_name"]], + left_on="route_id", + right_on="route_id", + how="left", + ) return data def code(gdf): - gdf.index=list(range(0,len(gdf))) - gdf.crs = {'init':'epsg:4326'} + gdf.index = list(range(0, len(gdf))) + gdf.crs = {"init": "epsg:4326"} lat_referece = gdf.geometry[0].coords[0][1] lon_reference = gdf.geometry[0].coords[0][0] zone = utm.from_latlon(lat_referece, lon_reference) - #The EPSG code is 32600+zone for positive latitudes and 32700+zone for negatives. - if lat_referece <0: + # The EPSG code is 32600+zone for positive latitudes and 32700+zone for negatives. 
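# (Illustration only, not part of the patch.) A self-contained sketch of the EPSG lookup
# that code() performs: utm.from_latlon returns (easting, northing, zone_number,
# zone_letter), and the zone number is mapped onto EPSG 326xx for the northern hemisphere
# and 327xx for the southern one. Assumes the `utm` package used above is installed.
import utm

def utm_epsg_for_point(lat: float, lon: float) -> int:
    zone_number = utm.from_latlon(lat, lon)[2]
    return (32600 if lat >= 0 else 32700) + zone_number

print(utm_epsg_for_point(48.85, 2.35))     # 32631 (UTM zone 31N)
print(utm_epsg_for_point(-33.45, -70.66))  # 32719 (UTM zone 19S)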
+ if lat_referece < 0: epsg_code = 32700 + zone[2] else: epsg_code = 32600 + zone[2] - + return epsg_code diff --git a/gtfs_functions/gtfs_functions.py b/gtfs_functions/gtfs_functions.py index a47867f..122c4a2 100644 --- a/gtfs_functions/gtfs_functions.py +++ b/gtfs_functions/gtfs_functions.py @@ -4,24 +4,39 @@ import os import logging import geopandas as gpd -import logging -import requests, io +import requests +import io import pendulum as pl import hashlib from shapely.geometry import LineString, MultiPoint -from gtfs_functions.aux_functions import * +from gtfs_functions.aux_functions import ( + add_all_lines, + add_runtime, + add_distance, + add_speed, + code, + fix_outliers, + num_to_letters, + add_route_name, + seconds_since_midnight, + window_creation, + label_creation, + add_frequency, + aggregate_speed, + add_all_lines_speed, + add_free_flow +) from itertools import permutations, chain from shapely import distance -from h3 import geo_to_h3, k_ring +from h3 import latlng_to_cell, grid_ring from time import time import boto3 -import io import sys -import pendulum as pl if not sys.warnoptions: import warnings + warnings.simplefilter("ignore") logging.basicConfig(level=logging.INFO) @@ -29,15 +44,15 @@ class Feed: def __init__( - self, - gtfs_path: str, - time_windows: list = [0, 6, 9, 15, 19, 22, 24], - busiest_date: bool = True, - geo: bool = True, - patterns: bool = True, - start_date: str = None, - end_date: str = None - ): + self, + gtfs_path: str, + time_windows: list = [0, 6, 9, 15, 19, 22, 24], + busiest_date: bool = True, + geo: bool = True, + patterns: bool = True, + start_date: str = None, + end_date: str = None, + ): self._gtfs_path = gtfs_path self._time_windows = time_windows @@ -72,11 +87,11 @@ def __init__( @property def gtfs_path(self): return self._gtfs_path - + @property def time_windows(self): return self._time_windows - + @property def busiest_date(self): return self._busiest_date @@ -89,7 +104,7 @@ def geo(self): def files(self): if self._files is None: self._files = self.get_files() - + return self._files @property @@ -101,7 +116,7 @@ def bbox(self): @property def start_date(self): return self._start_date - + @property def end_date(self): return self._end_date @@ -143,39 +158,38 @@ def busiest_service_id(self): """ if self._busiest_service_id is None: self._busiest_service_id = self.get_busiest_service_id() - + return self._busiest_service_id - + @property def agency(self): if self._agency is None: self._agency = self.get_agency() - + return self._agency @property def calendar(self): if self._calendar is None: self._calendar = self.get_calendar() - + return self._calendar @property def calendar_dates(self): if self._calendar_dates is None: self._calendar_dates = self.get_calendar_dates() - + return self._calendar_dates @property def trips(self): - logging.info('accessing trips') + logging.info("accessing trips") if self._trips is None: self._trips = self.get_trips() if self._patterns and self._trips_patterns is None: - (trips_patterns, routes_patterns) = self.get_routes_patterns( - self._trips) + (trips_patterns, routes_patterns) = self.get_routes_patterns(self._trips) self._trips_patterns = trips_patterns self._routes_patterns = routes_patterns return self._trips_patterns @@ -183,68 +197,68 @@ def trips(self): return self._trips_patterns return self._trips - + @property def routes(self): if self._routes is None: self._routes = self.get_routes() - + return self._routes - + @property def stops(self): if self._stops is None: self._stops = self.get_stops() - + 
return self._stops - + @property def stop_times(self): if self._stop_times is None: self._stop_times = self.get_stop_times() - + return self._stop_times - + @property def shapes(self): if self._shapes is None: self._shapes = self.get_shapes() - + return self._shapes - + @property def stops_freq(self): if self._stops_freq is None: self._stops_freq = self.get_stops_freq() - + return self._stops_freq - + @property def lines_freq(self): if self._lines_freq is None: self._lines_freq = self.get_lines_freq() - + return self._lines_freq - + @property def segments(self): if self._segments is None: self._segments = self.get_segments() return self._segments - + @property def segments_freq(self): if self._segments_freq is None: self._segments_freq = self.get_segments_freq() return self._segments_freq - + @property def speeds(self): if self._speeds is None: self._speeds = self.get_speeds() - + return self._speeds @property @@ -253,20 +267,20 @@ def avg_speeds(self): self._avg_speeds = self.get_avg_speeds() return self._avg_speeds - + @property def distance_matrix(self): if self._dist_matrix is None: self._dist_matrix = self.get_distance_between_stops() return self._dist_matrix - + @property def dates_service_id(self): if self._dates_service_id is None: self._dates_service_id = self.get_dates_service_id() return self._dates_service_id - + @trips.setter def trips(self, value): self._trips = value @@ -291,17 +305,16 @@ def shapes(self, value): def dates_service_id(self, value): self._dates_service_id = value - def get_files(self): gtfs_path = self.gtfs_path # S3 implementation - if gtfs_path.split('://')[0]=='s3': - s3 = boto3.resource('s3') - bucket = gtfs_path.split('://')[1].split('/')[0] + if gtfs_path.split("://")[0] == "s3": + s3 = boto3.resource("s3") + bucket = gtfs_path.split("://")[1].split("/")[0] boto_bucket = s3.Bucket(bucket) - key = '/'.join(gtfs_path.split('/')[3:]) - + key = "/".join(gtfs_path.split("/")[3:]) + with io.BytesIO() as data: boto_bucket.download_fileobj(key, data) with ZipFile(data) as myzip: @@ -309,19 +322,18 @@ def get_files(self): else: try: with ZipFile(gtfs_path) as myzip: - return myzip.namelist() + return myzip.namelist() # Try as a URL if the file is not in local except (FileNotFoundError, OSError) as e: - + logging.error(e) r = requests.get(self.gtfs_path) with ZipFile(io.BytesIO(r.content)) as myzip: return myzip.namelist() - def get_bbox(self): - logging.info('Getting the bounding box.') - stops = extract_file('stops', self) + logging.info("Getting the bounding box.") + stops = extract_file("stops", self) max_x = stops.stop_lon.max() min_x = stops.stop_lon.min() @@ -329,16 +341,16 @@ def get_bbox(self): min_y = stops.stop_lat.min() geo = { - 'type': 'Polygon', - 'coordinates': [ + "type": "Polygon", + "coordinates": [ [ [max_x, max_y], [max_x, min_y], [min_x, min_y], [min_x, max_y], - [max_x, max_y] + [max_x, max_y], ] - ] + ], } return geo @@ -347,118 +359,147 @@ def get_dates(self): start_date = self.start_date end_date = self.end_date if start_date is not None: - pl_start_date = pl.from_format(start_date, 'YYYY-MM-DD') - - if end_date is not None: - pl_end_date = pl.from_format(end_date, 'YYYY-MM-DD') - - + pl_start_date = pl.from_format(start_date, "YYYY-MM-DD") + + if end_date is not None: + pl_end_date = pl.from_format(end_date, "YYYY-MM-DD") + elif end_date is None: - logging.info('End date is None so we will take today as end date.') - + logging.info("End date is None so we will take today as end date.") + pl_end_date = pl.today() - + # Get all dates 
between start and end date period = pl.interval(pl_start_date, pl_end_date) return [day.to_date_string() for day in period] else: - logging.info('Start date is None. You should either specify a start date or set busiest_date to True.') + logging.info("Start date is None. You should either specify a start date or set busiest_date to True.") return [] - def get_routes_patterns(self, trips): """ Compute the different patterns of each route. returns (trips_patterns, routes_patterns) """ stop_times = self.stop_times - logging.info('computing patterns') + logging.info("computing patterns") trip_stops = stop_times[ - ['route_id', 'route_name', 'direction_id', 'shape_id', - 'trip_id', 'stop_id', 'stop_sequence']] - trip_stops['zipped_stops'] = list( - zip(trip_stops.stop_id, trip_stops.stop_sequence)) + [ + "route_id", + "route_name", + "direction_id", + "shape_id", + "trip_id", + "stop_id", + "stop_sequence", + ] + ] + trip_stops["zipped_stops"] = list(zip(trip_stops.stop_id, trip_stops.stop_sequence)) trip_stops_zipped = trip_stops.pivot_table( - 'zipped_stops', - index=['trip_id', 'route_id', 'route_name', 'direction_id', 'shape_id'], - aggfunc=list + "zipped_stops", + index=["trip_id", "route_id", "route_name", "direction_id", "shape_id"], + aggfunc=list, ).reset_index() - + trips_with_stops = trips.merge(trip_stops_zipped) def version_hash(x): hash = hashlib.sha1(f"{x.route_id}{x.direction_id}{str(x.zipped_stops)}".encode("UTF-8")).hexdigest() return hash[:18] - trips_with_stops['pattern_id'] = trips_with_stops.apply( - version_hash, axis=1) + trips_with_stops["pattern_id"] = trips_with_stops.apply(version_hash, axis=1) # Count number of trips per pattern to identify the main one route_patterns = trips_with_stops.pivot_table( - 'trip_id', + "trip_id", index=[ - 'route_id', 'route_name', 'pattern_id', 'direction_id', - 'shape_id', trips_with_stops.zipped_stops.astype(str) - ], aggfunc='count').reset_index() - - route_patterns = route_patterns\ - .rename({'trip_id': 'cnt_trips'}, axis=1)\ - .sort_values( - by=['route_name', 'direction_id', 'cnt_trips'], - ascending=[True, True, False] - ).reset_index(drop=True) - + "route_id", + "route_name", + "pattern_id", + "direction_id", + "shape_id", + trips_with_stops.zipped_stops.astype(str), + ], + aggfunc="count", + ).reset_index() + + route_patterns = ( + route_patterns.rename({"trip_id": "cnt_trips"}, axis=1) + .sort_values( + by=["route_name", "direction_id", "cnt_trips"], + ascending=[True, True, False], + ) + .reset_index(drop=True) + ) + # Add simple names to patterns: A, B, C, etc. 
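# (Illustration only, not part of the patch.) A small sketch of the enumeration step below:
# each (route_name, direction_id) pair gets its patterns numbered 1..n as a tuple, exploded
# to one row per pattern, and the number is turned into a letter. num_to_letters comes from
# aux_functions and is not shown in this diff; the chr() lambda is only a stand-in assuming
# it maps 1 -> 'A', 2 -> 'B', and so on.
import numpy as np
import pandas as pd

n_patterns = pd.DataFrame({"route_name": ["10", "10"], "direction_id": [0, 1], "cnt_trips": [2, 1]})
n_patterns["route_pattern"] = n_patterns.cnt_trips.apply(lambda n: tuple(np.arange(1, n + 1)))
n_patterns = n_patterns.explode("route_pattern").reset_index(drop=True)
n_patterns["route_pattern"] = n_patterns.route_pattern.apply(lambda i: chr(ord("A") + int(i) - 1))
n_patterns["pattern_name"] = (
    n_patterns.route_name + " - " + n_patterns.direction_id.astype(str) + " - " + n_patterns.route_pattern
)
print(n_patterns)  # patterns '10 - 0 - A', '10 - 0 - B' and '10 - 1 - A'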
- n_patterns = route_patterns.pivot_table('cnt_trips', index=['route_name', 'direction_id'], aggfunc='count').reset_index() - n_patterns['route_pattern'] = n_patterns.cnt_trips.apply(lambda row: tuple(np.arange(1, row+1))) - n_patterns = n_patterns.explode('route_pattern').reset_index(drop=True) - n_patterns['route_pattern'] = n_patterns.route_pattern.apply(num_to_letters) - n_patterns['pattern_name'] = n_patterns.route_name + ' - ' + n_patterns.direction_id.astype(int).astype(str) + ' - ' + n_patterns.route_pattern - n_patterns.sort_values(by=['route_name', 'direction_id', 'route_pattern'], inplace=True) + n_patterns = route_patterns.pivot_table( + "cnt_trips", index=["route_name", "direction_id"], aggfunc="count" + ).reset_index() + n_patterns["route_pattern"] = n_patterns.cnt_trips.apply(lambda row: tuple(np.arange(1, row + 1))) + n_patterns = n_patterns.explode("route_pattern").reset_index(drop=True) + n_patterns["route_pattern"] = n_patterns.route_pattern.apply(num_to_letters) + n_patterns["pattern_name"] = ( + n_patterns.route_name + + " - " + + n_patterns.direction_id.astype(int).astype(str) + + " - " + + n_patterns.route_pattern + ) + n_patterns.sort_values(by=["route_name", "direction_id", "route_pattern"], inplace=True) route_patterns = route_patterns.merge( - n_patterns[['route_pattern', 'pattern_name']], - right_index=True, left_index=True, how='left') - + n_patterns[["route_pattern", "pattern_name"]], + right_index=True, + left_index=True, + how="left", + ) + # Bring the pattern names to trips trips_with_stops = trips_with_stops.merge( - route_patterns[['pattern_id', 'route_pattern', 'pattern_name']], - how='left') - trips_with_patterns = trips_with_stops[[ - 'trip_id', 'route_id', 'pattern_id', 'route_pattern', 'pattern_name','route_name', - 'service_id', 'direction_id', 'shape_id']] + route_patterns[["pattern_id", "route_pattern", "pattern_name"]], how="left" + ) + trips_with_patterns = trips_with_stops[ + [ + "trip_id", + "route_id", + "pattern_id", + "route_pattern", + "pattern_name", + "route_name", + "service_id", + "direction_id", + "shape_id", + ] + ] return trips_with_patterns.copy(), route_patterns.copy() - def get_busiest_service_id(self): """ Returns the service_id with most trips as a string. 
""" - trips = extract_file('trips', self) - return trips.pivot_table( - 'trip_id', index='service_id', aggfunc='count')\ - .sort_values(by='trip_id', ascending=False).index[0] - + trips = extract_file("trips", self) + return ( + trips.pivot_table("trip_id", index="service_id", aggfunc="count") + .sort_values(by="trip_id", ascending=False) + .index[0] + ) def get_dates_service_id(self): dates_service_id = self.parse_calendar() - return dates_service_id.groupby('date').service_id.apply(list) - + return dates_service_id.groupby("date").service_id.apply(list) def get_agency(self): - return extract_file('agency', self) - + return extract_file("agency", self) def get_calendar(self): - return extract_file('calendar', self) - + return extract_file("calendar", self) def get_calendar_dates(self): - return extract_file('calendar_dates', self) - + return extract_file("calendar_dates", self) def parse_calendar(self): calendar = self.calendar @@ -467,62 +508,66 @@ def parse_calendar(self): if calendar is not None: # Parse dates - calendar['start_date_dt'] = calendar.start_date.astype(str).apply(pl.parse) - calendar['end_date_dt'] = calendar.end_date.astype(str).apply(pl.parse) + calendar["start_date_dt"] = calendar.start_date.astype(str).apply(pl.parse) + calendar["end_date_dt"] = calendar.end_date.astype(str).apply(pl.parse) # Get all dates for a given service_id - calendar['all_dates'] = calendar.apply( - lambda x: np.array([ - d for d in pl.interval(x.start_date_dt, x.end_date_dt).range('days') - ]), axis=1 - ) - + calendar["all_dates"] = calendar.apply( + lambda x: np.array([d for d in pl.interval(x.start_date_dt, x.end_date_dt).range("days")]), + axis=1, + ) + # Boolean variables for day types cols = [ - 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', - 'saturday', 'sunday'] - + "monday", + "tuesday", + "wednesday", + "thursday", + "friday", + "saturday", + "sunday", + ] + vf = np.vectorize(bool) calendar[cols] = vf(calendar[cols].values) # Hash weekdays to make it faster def get_hash_weekdays(row): - return { - i: v - for i, v in enumerate(row[cols].values[0]) - } + return {i: v for i, v in enumerate(row[cols].values[0])} - hash_weekdays = calendar.groupby('service_id').apply(get_hash_weekdays) + hash_weekdays = calendar.groupby("service_id").apply(get_hash_weekdays) # Filter dates depending on the days of the week - calendar['filtered_dates'] = calendar.apply(lambda row: row.all_dates[ - [ - hash_weekdays[row.service_id][d.weekday()] - for d in row.all_dates - ]], axis=1) - + calendar["filtered_dates"] = calendar.apply( + lambda row: row.all_dates[[hash_weekdays[row.service_id][d.weekday()] for d in row.all_dates]], + axis=1, + ) + # Explode filtered_dates - t = calendar[['service_id', 'filtered_dates']].explode('filtered_dates') - + t = calendar[["service_id", "filtered_dates"]].explode("filtered_dates") + # Keep the service_ids that apply to at least one date t = t[t.filtered_dates.notnull()] - t['filtered_dates'] = t.filtered_dates.dt.date.astype(str) + t["filtered_dates"] = t.filtered_dates.dt.date.astype(str) - t = t.groupby('filtered_dates').service_id.apply(list) + t = t.groupby("filtered_dates").service_id.apply(list) # Create dictionary with dates as keys and service_id as items date_hash = t.apply(lambda x: dict(zip(x, [True] * len(x)))).to_dict() else: - date_hash = {} - + date_hash = {} + if calendar_dates is not None: # --- Do the same for calendar_dates --- - calendar_dates['date_str'] = calendar_dates.date.astype(str).apply(pl.parse)\ - .dt.date.astype(str) - - cdates_hash 
= calendar_dates[calendar_dates.exception_type==1].groupby('date_str')\ - .service_id.apply(list)\ - .apply(lambda x: dict(zip(x, [True] * len(x)))).to_dict() + calendar_dates["date_str"] = calendar_dates.date.astype(str).apply(pl.parse).dt.date.astype(str) + + cdates_hash = ( + calendar_dates[calendar_dates.exception_type == 1] + .groupby("date_str") + .service_id.apply(list) + .apply(lambda x: dict(zip(x, [True] * len(x)))) + .to_dict() + ) else: cdates_hash = {} @@ -546,35 +591,34 @@ def get_hash_weekdays(row): for d in remove_dates: dates.remove(d) - # Create dataframe with the service_id that applies to each date + # Create dataframe with the service_id that applies to each date aux = pd.concat([pd.DataFrame(date_hash), pd.DataFrame(cdates_hash)]).T.reset_index() - dates_service_id = pd.melt(aux, id_vars='index', value_vars=aux.columns) - dates_service_id.columns=['date', 'service_id', 'keep'] - - return dates_service_id[~dates_service_id.keep.isnull()] + dates_service_id = pd.melt(aux, id_vars="index", value_vars=aux.columns) + dates_service_id.columns = ["date", "service_id", "keep"] + return dates_service_id[~dates_service_id.keep.isnull()] def get_trips(self): routes = self.routes dates = self.dates - trips = extract_file('trips', self) - trips['trip_id'] = trips.trip_id.astype(str) - trips['route_id'] = trips.route_id.astype(str) + trips = extract_file("trips", self) + trips["trip_id"] = trips.trip_id.astype(str) + trips["route_id"] = trips.route_id.astype(str) - if 'shape_id' in trips.columns: - trips['shape_id'] = trips.shape_id.astype(str) + if "shape_id" in trips.columns: + trips["shape_id"] = trips.shape_id.astype(str) # If we were asked to only fetch the busiest date # if self.busiest_date: - # trips = trips[trips.service_id==self.busiest_service_id] + # trips = trips[trips.service_id==self.busiest_service_id] # If we're looking for the busiest date or a specific list of # dates we need to parse de calendar - if (self.busiest_date) | (dates!=[]): + if (self.busiest_date) | (dates != []): """ In the case we have three possibilites: - 1. busiest_date=True & dates==[]: in this case the user looks for the + 1. busiest_date=True & dates==[]: in this case the user looks for the busiest date in the entire feed 2. busiest_date=True & dates!=[]: in this case the user looks for the busiest date within the date range provided. @@ -587,158 +631,164 @@ def get_trips(self): # If busiest_date=True, we have to count the number of trips if self.busiest_date: # Trip per date - date_ntrips = trips.merge(dates_service_id).groupby(['date']).\ - trip_id.count().sort_values(ascending=False) - + date_ntrips = ( + trips.merge(dates_service_id).groupby(["date"]).trip_id.count().sort_values(ascending=False) + ) + # If we are looking for the busiest date within our date period, # we only keep the dates in that period of time. 
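# (Illustration only, not part of the patch.) A toy version of the busiest-date selection
# performed below: trips are joined to the dates on which their service_id runs, counted
# per date, and the date(s) with the highest trip count are kept. Both frames are invented
# for the example.
import pandas as pd

trips = pd.DataFrame({"trip_id": ["t1", "t2", "t3"], "service_id": ["wk", "wk", "sat"]})
dates_service_id = pd.DataFrame(
    {"date": ["2024-05-06", "2024-05-11"], "service_id": ["wk", "sat"]}
)
date_ntrips = (
    trips.merge(dates_service_id).groupby("date").trip_id.count().sort_values(ascending=False)
)
busiest_date = list(date_ntrips[date_ntrips == date_ntrips.max()].index)
print(busiest_date, date_ntrips.max())  # ['2024-05-06'] 2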
- if (self.busiest_date) & (dates!=[]): + if (self.busiest_date) & (dates != []): dates_service_id = dates_service_id[dates_service_id.date.isin(dates)] date_ntrips = date_ntrips[date_ntrips.index.isin(dates)] - - # Now that we've considered both cases we can just filter + + # Now that we've considered both cases we can just filter # with the busiest_date of the "dates" that made it this far if self.busiest_date: # In that case, if "dates" is empty we need to find the busiest date - busiest_date = list(date_ntrips[date_ntrips==date_ntrips.max()].index) - max_trips = date_ntrips[date_ntrips==date_ntrips.max()].values[0] + busiest_date = list(date_ntrips[date_ntrips == date_ntrips.max()].index) + max_trips = date_ntrips[date_ntrips == date_ntrips.max()].values[0] - logging.info(f'The busiest date/s of this feed or your selected date range is/are: {busiest_date} with {max_trips} trips.') - logging.info('In the case that more than one busiest date was found, the first one will be considered.') - logging.info(f'In this case is {busiest_date[0]}.') + logging.info( + "The busiest date/s of this feed or your selected date range" + + f" is/are: {busiest_date} with {max_trips} trips." + ) + logging.info("In the case that more than one busiest date was found, the first one will be considered.") + logging.info(f"In this case is {busiest_date[0]}.") # We need "dates" to be a list dates = busiest_date[:1] - + # Keep only the trips that are relevant to the use case - trips = trips.set_index('service_id').join( - dates_service_id[dates_service_id.date.isin(dates)]\ - .set_index('service_id'), - how='inner' - ).reset_index(names='service_id').drop(['keep', 'date'], axis=1).drop_duplicates() + trips = ( + trips.set_index("service_id") + .join( + dates_service_id[dates_service_id.date.isin(dates)].set_index("service_id"), + how="inner", + ) + .reset_index(names="service_id") + .drop(["keep", "date"], axis=1) + .drop_duplicates() + ) # Get routes info in trips # The GTFS feed might be missing some of the keys, e.g. direction_id or shape_id. # To allow processing incomplete GTFS data, we must reindex instead: # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike # This will add NaN for any missing columns. 
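# (Illustration only, not part of the patch.) The comment above relies on this reindex
# behaviour: selecting missing columns with .loc raises a KeyError, while
# reindex(columns=...) quietly adds them filled with NaN, so an incomplete feed without
# direction_id or shape_id can still be processed and defaulted afterwards.
import pandas as pd

trips = pd.DataFrame({"trip_id": ["t1", "t2"], "route_id": ["r1", "r1"]})
cols = ["trip_id", "route_id", "route_name", "service_id", "direction_id", "shape_id"]
trips = trips.reindex(columns=cols)                    # missing columns appear as NaN
trips["direction_id"] = trips.direction_id.fillna(0)   # same default as the code below
print(trips)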
- cols = ['trip_id', 'route_id', 'route_name', 'service_id', 'direction_id', 'shape_id'] + cols = [ + "trip_id", + "route_id", + "route_name", + "service_id", + "direction_id", + "shape_id", + ] trips = add_route_name(trips, routes).reindex(columns=cols) - + # Fill null values - trips['direction_id'] = trips.direction_id.fillna(0) + trips["direction_id"] = trips.direction_id.fillna(0) return trips - def get_routes(self): - routes = extract_file('routes', self) - routes['route_id'] = routes.route_id.astype(str) - - if 'route_short_name' in routes.columns: - routes['route_short_name'] = routes.route_short_name.astype(str) - if 'route_short_name' in routes.columns: - routes['route_long_name'] = routes.route_long_name.astype(str) - - return routes + routes = extract_file("routes", self) + routes["route_id"] = routes.route_id.astype(str) + if "route_short_name" in routes.columns: + routes["route_short_name"] = routes.route_short_name.astype(str) + if "route_short_name" in routes.columns: + routes["route_long_name"] = routes.route_long_name.astype(str) + + return routes def get_stops(self): - stops = extract_file('stops', self) - + stops = extract_file("stops", self) + if self.geo: # Add geometry to stops stops = gpd.GeoDataFrame( data=stops, - geometry=gpd.points_from_xy( - stops.stop_lon, stops.stop_lat), - crs=4326 + geometry=gpd.points_from_xy(stops.stop_lon, stops.stop_lat), + crs=4326, ) - stops['stop_id'] = stops.stop_id.astype(str) - stops['stop_name'] = stops.stop_name.astype(str) + stops["stop_id"] = stops.stop_id.astype(str) + stops["stop_name"] = stops.stop_name.astype(str) return stops - def get_stop_times(self): # Get trips, routes and stops info in stop_times - stop_times = extract_file('stop_times', self) - if self._trips is not None: # prevents infinite loop - logging.info('_trips is defined in stop_times') + stop_times = extract_file("stop_times", self) + if self._trips is not None: # prevents infinite loop + logging.info("_trips is defined in stop_times") trips = self._trips else: - logging.info('get trips in stop_times') + logging.info("get trips in stop_times") trips = self.trips stops = self.stops # Fix data types - stop_times['trip_id'] = stop_times.trip_id.astype(str) - stop_times['stop_id'] = stop_times.stop_id.astype(str) - - if 'route_id' in stop_times.columns: - stop_times['route_id'] = stop_times.route_id.astype(str) + stop_times["trip_id"] = stop_times.trip_id.astype(str) + stop_times["stop_id"] = stop_times.stop_id.astype(str) + + if "route_id" in stop_times.columns: + stop_times["route_id"] = stop_times.route_id.astype(str) - if 'shape_id' in stop_times.columns: - stop_times['shape_id'] = stop_times.shape_id.astype(str) + if "shape_id" in stop_times.columns: + stop_times["shape_id"] = stop_times.shape_id.astype(str) # We merge stop_times to "trips" (not the other way around) because # "trips" have already been filtered by the busiest service_id - stop_times = trips.merge(stop_times, how='inner') - + stop_times = trips.merge(stop_times, how="inner") + if self.geo: - stop_times = stop_times.merge(stops, how='left') + stop_times = stop_times.merge(stops, how="left") # stop_times needs to be geodataframe if we want to do geometry operations - stop_times = gpd.GeoDataFrame(stop_times, geometry='geometry') + stop_times = gpd.GeoDataFrame(stop_times, geometry="geometry") # direction_id is optional, as it is not needed to determine route shapes # However, if direction_id is NaN, pivot_table will return an empty DataFrame. 
# Therefore, use a sensible default if direction id is not known. # Some gtfs feeds only contain direction_id 0, use that as default - stop_times['direction_id'] = stop_times['direction_id'].fillna(0) + stop_times["direction_id"] = stop_times["direction_id"].fillna(0) # Pass times to seconds since midnight - stop_times['arrival_time'] = [ - seconds_since_midnight(t) - if t not in [None, np.nan] else None - for t in stop_times.arrival_time] - stop_times['departure_time'] = [ - seconds_since_midnight(t) - if t not in [None, np.nan] else None - for t in stop_times.departure_time] + stop_times["arrival_time"] = [ + seconds_since_midnight(t) if t not in [None, np.nan] else None for t in stop_times.arrival_time + ] + stop_times["departure_time"] = [ + seconds_since_midnight(t) if t not in [None, np.nan] else None for t in stop_times.departure_time + ] return stop_times - def get_shapes(self): if self.geo: - aux = extract_file('shapes', self) + aux = extract_file("shapes", self) # Sort shapes by shape_pt_sequence - aux.sort_values(['shape_id','shape_pt_sequence'], inplace=True) - shapes = aux[["shape_id", "shape_pt_lat", "shape_pt_lon"]]\ - .groupby("shape_id")\ - .agg(list)\ - .apply(lambda x: LineString(zip(x[1], x[0])), axis=1) - - shapes = gpd.GeoDataFrame( - data=shapes.index, - geometry = shapes.values, - crs=4326 + aux.sort_values(["shape_id", "shape_pt_sequence"], inplace=True) + shapes = ( + aux[["shape_id", "shape_pt_lat", "shape_pt_lon"]] + .groupby("shape_id") + .agg(list) + .apply(lambda x: LineString(zip(x[1], x[0])), axis=1) ) - shapes['shape_id'] = shapes.shape_id.astype(str) + + shapes = gpd.GeoDataFrame(data=shapes.index, geometry=shapes.values, crs=4326) + shapes["shape_id"] = shapes.shape_id.astype(str) return shapes else: - shapes = extract_file('shapes', self) - shapes['shape_id'] = shapes.shape_id.astype(str) + shapes = extract_file("shapes", self) + shapes["shape_id"] = shapes.shape_id.astype(str) return shapes - def get_stops_freq(self): """ - Get the stop frequencies. For each stop of each route it + Get the stop frequencies. For each stop of each route it returns the bus frequency in minutes/bus broken down by time window. """ @@ -746,40 +796,33 @@ def get_stops_freq(self): stops = self.stops cutoffs = self.time_windows - if 'window' not in stop_times.columns: + if "window" not in stop_times.columns: stop_times = window_creation(stop_times, cutoffs) else: - stop_times['window'] = stop_times.window.astype(str) + stop_times["window"] = stop_times.window.astype(str) labels = label_creation(cutoffs) - stop_frequencies = add_frequency( - stop_times, labels, index_='stop_id', - col='window', cutoffs=cutoffs) + stop_frequencies = add_frequency(stop_times, labels, index_="stop_id", col="window", cutoffs=cutoffs) if self.geo: - stops_cols = ['stop_id', 'stop_name', 'geometry'] + stops_cols = ["stop_id", "stop_name", "geometry"] else: - stops_cols = ['stop_id', 'stop_name'] - - stop_frequencies = stop_frequencies.merge( - stops[stops_cols], how='left') + stops_cols = ["stop_id", "stop_name"] + stop_frequencies = stop_frequencies.merge(stops[stops_cols], how="left") if self.geo: - stop_frequencies = gpd.GeoDataFrame( - data=stop_frequencies, - geometry=stop_frequencies.geometry) + stop_frequencies = gpd.GeoDataFrame(data=stop_frequencies, geometry=stop_frequencies.geometry) return stop_frequencies - def get_lines_freq(self): """ Calculates the frequency for each pattern of a route. Returns the bus frequency in minutes/bus broken down by time window. 
""" - + stop_times = self.stop_times shapes = self.shapes cutoffs = self.time_windows @@ -787,66 +830,78 @@ def get_lines_freq(self): stop_times_first = stop_times.loc[stop_times.stop_sequence == 1, :] # Create time windows - if 'window' not in stop_times_first.columns: + if "window" not in stop_times_first.columns: stop_times_first = window_creation(stop_times_first, cutoffs) else: - stop_times_first['window'] = stop_times_first.window.astype(str) + stop_times_first["window"] = stop_times_first.window.astype(str) # Create labels labels = label_creation(cutoffs) # Get frequencies line_frequencies = add_frequency( - stop_times_first, labels, index_=['route_id', 'route_name', 'shape_id'], - col='window', cutoffs=cutoffs) + stop_times_first, + labels, + index_=["route_id", "route_name", "shape_id"], + col="window", + cutoffs=cutoffs, + ) # Do we want a geodataframe? if self.geo: - line_frequencies = pd.merge(line_frequencies, shapes, how='left') - line_frequencies = gpd.GeoDataFrame( - data=line_frequencies, - geometry=line_frequencies.geometry, - crs=4326) + line_frequencies = pd.merge(line_frequencies, shapes, how="left") + line_frequencies = gpd.GeoDataFrame(data=line_frequencies, geometry=line_frequencies.geometry, crs=4326) # Clean the df keep_these = [ - 'route_id', 'route_name', 'direction_id', - 'window', 'min_per_trip', 'ntrips', 'geometry'] + "route_id", + "route_name", + "direction_id", + "window", + "min_per_trip", + "ntrips", + "geometry", + ] - line_frequencies = line_frequencies.loc[ - ~line_frequencies.geometry.isnull(), keep_these] + line_frequencies = line_frequencies.loc[~line_frequencies.geometry.isnull(), keep_these] return line_frequencies - def get_segments(self): """Splits each route's shape into stop-stop LineString called segments Returns the segment geometry as well as additional segment information """ - logging.info('Getting segments...') + logging.info("Getting segments...") stop_times = self.stop_times shapes = self.shapes req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry"] - add_columns = ["route_id", "route_name","direction_id", "stop_name"] + add_columns = ["route_id", "route_name", "direction_id", "stop_name"] # merge stop_times and shapes to calculate cut distance and interpolated point - df_shape_stop = stop_times[req_columns + add_columns].drop_duplicates()\ + df_shape_stop = ( + stop_times[req_columns + add_columns] + .drop_duplicates() .merge(shapes, on="shape_id", suffixes=("_stop", "_shape")) - logging.info('Projecting stops onto shape...') - df_shape_stop["cut_distance_stop_point"] = df_shape_stop[["geometry_stop", "geometry_shape"]]\ - .apply(lambda x: x[1].project(x[0], normalized=True), axis=1) - logging.info('Interpolating stops onto shape...') - df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]]\ - .apply(lambda x: x[0].interpolate(x[1], normalized=True), axis=1) - - # calculate cut distance for - logging.info('Sorting shape points and stops...') + ) + logging.info("Projecting stops onto shape...") + df_shape_stop["cut_distance_stop_point"] = df_shape_stop[["geometry_stop", "geometry_shape"]].apply( + lambda x: x[1].project(x[0], normalized=True), axis=1 + ) + logging.info("Interpolating stops onto shape...") + df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply( + lambda x: x[0].interpolate(x[1], normalized=True), axis=1 + ) + + # calculate cut distance for + logging.info("Sorting shape points and stops...") df_shape = 
shapes[shapes.shape_id.isin(stop_times.shape_id.unique())] df_shape["list_of_points"] = df_shape.geometry.apply(lambda x: list(MultiPoint(x.coords).geoms)) df_shape_exp = df_shape.explode("list_of_points") - df_shape_exp["projected_line_points"] = df_shape_exp[["geometry", "list_of_points"]].apply(lambda x: x[0].project(x[1], normalized=True), axis=1) + df_shape_exp["projected_line_points"] = df_shape_exp[["geometry", "list_of_points"]].apply( + lambda x: x[0].project(x[1], normalized=True), axis=1 + ) # rename both dfs to concatenate df_shape_stop.rename( @@ -855,7 +910,7 @@ def get_segments(self): "cut_distance_stop_point": "normalized_distance_along_shape", }, axis=1, - inplace=True + inplace=True, ) df_shape_stop["cut_flag"] = True @@ -866,7 +921,7 @@ def get_segments(self): "projected_line_points": "normalized_distance_along_shape", }, axis=1, - inplace=True + inplace=True, ) df_shape_exp["cut_flag"] = False @@ -879,44 +934,64 @@ def get_segments(self): # drop all non stops (had to combine first fto get their gdf index) cuts = gdf.where(gdf.cut_flag).dropna(subset="cut_flag") cuts = cuts.astype({"shape_id": str, "stop_sequence": int, "direction_id": int}) - cuts[["end_stop_id", "end_stop_name"]] = cuts.groupby("shape_id")[['stop_id', "stop_name"]].shift(-1) + cuts[["end_stop_id", "end_stop_name"]] = cuts.groupby("shape_id")[["stop_id", "stop_name"]].shift(-1) # Create LineString for each stop to stop segment_geometries = [] for shape_id in cuts.shape_id.drop_duplicates(): cut_idx = cuts[cuts.shape_id == shape_id].index for i, cut in enumerate(cut_idx[:-1]): - segment_geometries.append(LineString(gdf.iloc[cut_idx[i]:cut_idx[i+1]+1].geometry)) - - # create into gpd adding additional columns + segment_geometries.append(LineString(gdf.iloc[cut_idx[i] : cut_idx[i + 1] + 1].geometry)) + + # create into gpd adding additional columns segment_df = cuts.dropna(subset="end_stop_id", axis=0) - logging.info(f'segments_df: {len(segment_df)}, geometry: {len(segment_geometries)}') + logging.info(f"segments_df: {len(segment_df)}, geometry: {len(segment_geometries)}") segment_gdf = gpd.GeoDataFrame(segment_df, geometry=segment_geometries) # drop irrelevant columns - segment_gdf.drop(["geometry_shape", "cut_flag", "normalized_distance_along_shape", "geometry_stop"], axis=1, inplace=True) + segment_gdf.drop( + [ + "geometry_shape", + "cut_flag", + "normalized_distance_along_shape", + "geometry_stop", + ], + axis=1, + inplace=True, + ) segment_gdf.crs = "EPSG:4326" # Add segment length in meters - segment_gdf['distance_m'] = segment_gdf.to_crs(code(segment_gdf)).length + segment_gdf["distance_m"] = segment_gdf.to_crs(code(segment_gdf)).length # Add segment_id and name - segment_gdf['segment_id'] = segment_gdf.stop_id.astype(str) + ' - ' + segment_gdf.end_stop_id.astype(str) - segment_gdf['segment_name'] = segment_gdf.stop_name + ' - ' + segment_gdf.end_stop_name + segment_gdf["segment_id"] = segment_gdf.stop_id.astype(str) + " - " + segment_gdf.end_stop_id.astype(str) + segment_gdf["segment_name"] = segment_gdf.stop_name + " - " + segment_gdf.end_stop_name # Order columns col_ordered = [ - 'shape_id', 'route_id', 'route_name','direction_id', - 'stop_sequence', 'segment_name', 'stop_name', 'end_stop_name', 'segment_id','stop_id', 'end_stop_id', - 'distance_m', 'geometry'] - + "shape_id", + "route_id", + "route_name", + "direction_id", + "stop_sequence", + "segment_name", + "stop_name", + "end_stop_name", + "segment_id", + "stop_id", + "end_stop_id", + "distance_m", + "geometry", + ] + segment_gdf = 
segment_gdf[col_ordered] segment_gdf.rename( - columns=dict(stop_name='start_stop_name', stop_id='start_stop_id'), - inplace=True) + columns=dict(stop_name="start_stop_name", stop_id="start_stop_id"), + inplace=True, + ) return segment_gdf - def get_speeds(self): stop_times = self.stop_times segment_gdf = self.segments @@ -929,14 +1004,26 @@ def get_speeds(self): speeds = add_speed(aux) cols = [ - 'route_name', 'direction_id', 'stop_sequence', - 'segment_name', 'start_stop_name', 'end_stop_name', - 'speed_kmh', 'runtime_sec', 'arrival_time', - 'departure_time', 'distance_m', 'route_id', 'start_stop_id', 'end_stop_id', 'segment_id', 'shape_id', 'geometry' + "route_name", + "direction_id", + "stop_sequence", + "segment_name", + "start_stop_name", + "end_stop_name", + "speed_kmh", + "runtime_sec", + "arrival_time", + "departure_time", + "distance_m", + "route_id", + "start_stop_id", + "end_stop_id", + "segment_id", + "shape_id", + "geometry", ] - + return speeds[cols] - def get_avg_speeds(self): """ @@ -963,82 +1050,110 @@ def get_avg_speeds(self): # Do we want a geodataframe? if self.geo: - data = gpd.GeoDataFrame( - data=data, - geometry=data.geometry, - crs=4326) - + data = gpd.GeoDataFrame(data=data, geometry=data.geometry, crs=4326) + ordered_cols = [ - 'route_id', 'route_name', 'direction_id', 'stop_sequence', - 'segment_name', 'window', - 'speed_kmh', 'avg_route_speed_kmh', 'segment_max_speed_kmh', 'runtime_sec', - 'start_stop_name', 'end_stop_name', 'segment_id', 'start_stop_id', 'end_stop_id', - 'shape_id', 'distance_m', 'geometry'] - - return data[ordered_cols] + "route_id", + "route_name", + "direction_id", + "stop_sequence", + "segment_name", + "window", + "speed_kmh", + "avg_route_speed_kmh", + "segment_max_speed_kmh", + "runtime_sec", + "start_stop_name", + "end_stop_name", + "segment_id", + "start_stop_id", + "end_stop_id", + "shape_id", + "distance_m", + "geometry", + ] + return data[ordered_cols] def get_segments_freq(self): - + stop_times = self.stop_times segment_gdf = self.segments cutoffs = self.time_windows - if 'window' not in stop_times.columns: + if "window" not in stop_times.columns: stop_times = window_creation(stop_times, cutoffs) else: - stop_times['window'] = stop_times.window.astype(str) + stop_times["window"] = stop_times.window.astype(str) # Get labels labels = label_creation(cutoffs) # Aggregate trips line_frequencies = add_frequency( - stop_times, labels, index_=['route_id', 'route_name', 'stop_id'], - col='window', cutoffs=cutoffs) + stop_times, + labels, + index_=["route_id", "route_name", "stop_id"], + col="window", + cutoffs=cutoffs, + ) keep_these = [ - 'route_id', 'route_name', 'segment_name', - 'start_stop_name', 'end_stop_name', - 'segment_id', 'start_stop_id', 'end_stop_id', - 'direction_id', 'geometry'] + "route_id", + "route_name", + "segment_name", + "start_stop_name", + "end_stop_name", + "segment_id", + "start_stop_id", + "end_stop_id", + "direction_id", + "geometry", + ] line_frequencies = pd.merge( line_frequencies, segment_gdf[keep_these], - left_on=['route_id', 'route_name', 'stop_id', 'direction_id'], - right_on=['route_id', 'route_name', 'start_stop_id', 'direction_id'], - how='left') - - line_frequencies.drop('stop_id', axis=1, inplace=True) + left_on=["route_id", "route_name", "stop_id", "direction_id"], + right_on=["route_id", "route_name", "start_stop_id", "direction_id"], + how="left", + ) + + line_frequencies.drop("stop_id", axis=1, inplace=True) # Remove duplicates after merging line_frequencies.drop_duplicates(inplace=True) # 
Aggregate for all lines - data_complete = add_all_lines( - line_frequencies, segment_gdf, labels, cutoffs) + data_complete = add_all_lines(line_frequencies, segment_gdf, labels, cutoffs) # Do we want a geodataframe? if self.geo is True: data_complete = gpd.GeoDataFrame( - data=data_complete.drop('geometry', axis=1), - geometry=data_complete.geometry) + data=data_complete.drop("geometry", axis=1), + geometry=data_complete.geometry, + ) # Clean data keep_these = [ - 'route_id', 'route_name', - 'direction_id', - 'segment_name', 'start_stop_name', 'end_stop_name', - 'window', 'min_per_trip', 'ntrips', - 'start_stop_id', 'end_stop_id', 'segment_id', - 'geometry' + "route_id", + "route_name", + "direction_id", + "segment_name", + "start_stop_name", + "end_stop_name", + "window", + "min_per_trip", + "ntrips", + "start_stop_id", + "end_stop_id", + "segment_id", + "geometry", ] data_complete = data_complete.loc[~data_complete.geometry.isnull()][keep_these] return data_complete - def get_distance_between_stops(self): """ @@ -1047,51 +1162,49 @@ def get_distance_between_stops(self): From what I see, the optimal resolution for our purposes is resolution=9. Taking the Hex bin at this resolution and its neighbors works as a better - clustering method than DBSCAN. + clustering method than DBSCAN. We can then only calculate the distance between each stop and the ones that are in the neighboring hex bins. """ stops_ = self.stops.copy() - logging.info('Getting hex bins.') + logging.info("Getting hex bins.") RESOLUTION = 9 - stops_hash = stops_.to_dict()['stop_id'] + stops_hash = stops_.to_dict()["stop_id"] stops_.reset_index(inplace=True) - stops_.rename(columns=dict(index = 'stop_index'), inplace=True) + stops_.rename(columns=dict(index="stop_index"), inplace=True) - stops_['hex'] = stops_.apply( - lambda row: geo_to_h3(row.stop_lat, row.stop_lon, RESOLUTION), axis=1) + stops_["hex"] = stops_.apply(lambda row: latlng_to_cell(row.stop_lat, row.stop_lon, RESOLUTION), axis=1) # stops.head() # Stops to utm for distance calculatin - utm_stops = stops_[ - ['stop_index', 'stop_id', 'stop_name', 'hex', 'geometry']].to_crs(code(stops_)) + utm_stops = stops_[["stop_index", "stop_id", "stop_name", "hex", "geometry"]].to_crs(code(stops_)) # # Hash for faster results # stops_h3_hash = stops.set_index('stop_index').to_dict()['hex'] # h3_stops_hash = stops.set_index('hex').to_dict()['stop_index'] # Stops in Hexbins - h3_stops = stops_.groupby('hex').stop_index.apply(list) - h3_geos = utm_stops.groupby('hex').geometry.apply(list) + h3_stops = stops_.groupby("hex").stop_index.apply(list) + h3_geos = utm_stops.groupby("hex").geometry.apply(list) # Unique hex - h3_neighbors = {hex: k_ring(hex, k=1) for hex in stops_.hex.unique()} + h3_neighbors = {hex: grid_ring(hex, k=1) for hex in stops_.hex.unique()} st = time() stops_comb = [] distances = [] - logging.info('Looking for stop distances') + logging.info("Looking for stop distances") for hex, h3_group in h3_neighbors.items(): s_index = h3_stops[h3_stops.index.isin(h3_group)].values s_geos = h3_geos[h3_geos.index.isin(h3_group)].values - + stops_list = list(chain.from_iterable(s_index)) geo_list = list(chain.from_iterable(s_geos)) geo_perm = list(permutations(geo_list, 2)) @@ -1100,88 +1213,63 @@ def get_distance_between_stops(self): distances.extend([distance(pair[0], pair[1]) for pair in geo_perm]) # Make dataframe - dist_df = pd.DataFrame(data=stops_comb, columns=['stop_index_1', 'stop_index_2']) - dist_df['distance_m'] = distances - 
dist_df.drop_duplicates(subset=['stop_index_1', 'stop_index_2'], inplace=True) + dist_df = pd.DataFrame(data=stops_comb, columns=["stop_index_1", "stop_index_2"]) + dist_df["distance_m"] = distances + dist_df.drop_duplicates(subset=["stop_index_1", "stop_index_2"], inplace=True) et = time() # print(f'Calculating distances took {et-st} seconds') - logging.info(f'Calculating distances took {et-st} seconds') - - logging.info(f'Calculate walking times') + logging.info(f"Calculating distances took {et-st} seconds") + logging.info("Calculate walking times") # Calculate walking times # Assume 1.11 m/s as average walking speed as the literature suggests (4km/h=1.11 m/s) walking_speed_ms = 4 / 3.6 - dist_df['connection_time_min'] =\ - dist_df.distance_m * walking_speed_ms / 60 - + dist_df["connection_time_min"] = dist_df.distance_m * walking_speed_ms / 60 + # Add stop_id to distance matrix - dist_df['stop_id_1'] = dist_df.stop_index_1.apply(lambda x: stops_hash[x]) - dist_df['stop_id_2'] = dist_df.stop_index_2.apply(lambda x: stops_hash[x]) - + dist_df["stop_id_1"] = dist_df.stop_index_1.apply(lambda x: stops_hash[x]) + dist_df["stop_id_2"] = dist_df.stop_index_2.apply(lambda x: stops_hash[x]) + return dist_df def extract_file(file, feed): - data_types = { - 'shape_id': str, - 'stop_id': str, - 'route_id': str, - 'trip_id': str - } + data_types = {"shape_id": str, "stop_id": str, "route_id": str, "trip_id": str} files = feed.files gtfs_path = feed.gtfs_path - # check if the the zip file came from a zipped folder - if len(files[0].split('/')) == 1: + # check if the the zip file came from a zipped folder + if len(files[0].split("/")) == 1: file_path = f"{file}.txt" else: file_path = f"{files[0].split('/')[0]}/{file}.txt" - # S3 implementation - if gtfs_path.split('://')[0]=='s3': - s3 = boto3.resource('s3') - bucket = gtfs_path.split('://')[1].split('/')[0] - boto_bucket = s3.Bucket(bucket) - key = '/'.join(gtfs_path.split('/')[3:]) - - with io.BytesIO() as data: - boto_bucket.download_fileobj(key, data) - try: - with ZipFile(data) as myzip: - logging.info(f'Reading "{file}.txt".') - myzip.extract(file_path, path='/tmp') - data = pd.read_csv(f'/tmp/{file_path}', dtype=data_types) - - os.remove(f"/tmp/{file_path}") - return data - except (FileNotFoundError, OSError, KeyError) as e: - return logging.info(f'File "{file}.txt" not found.') - else: - try: - if file_path in files: - with ZipFile(gtfs_path) as myzip: - logging.info(f'Reading "{file}.txt".') - myzip.extract(file_path, path='/tmp') - data = pd.read_csv(f'/tmp/{file_path}', dtype=data_types) - - os.remove(f"/tmp/{file_path}") - return data - else: - return logging.info(f'File "{file}.txt" not found.') - - # Try as a URL - except (FileNotFoundError, OSError) as e: - if f'{file}.txt' in files: - r = requests.get(gtfs_path) - with ZipFile(io.BytesIO(r.content)) as myzip: - logging.info(f'Reading "{file}.txt".') - myzip.extract(f"{file_path}", path='/tmp') - data = pd.read_csv(f'/tmp/{file_path}', dtype=data_types) - - os.remove(f"/tmp/{file_path}") - return data - else: - return logging.info(f'File "{file}.txt" not found.') + # Try as a local file + try: + if file_path in files: + with ZipFile(gtfs_path) as myzip: + logging.info(f'Reading "{file}.txt".') + myzip.extract(file_path, path="/tmp") + data = pd.read_csv(f"/tmp/{file_path}", dtype=data_types) + + os.remove(f"/tmp/{file_path}") + return data + else: + return logging.info(f'File "{file}.txt" not found.') + + # Try as a URL + except (FileNotFoundError, OSError) as e: + logging.error(e) + if 
f"{file}.txt" in files: + r = requests.get(gtfs_path) + with ZipFile(io.BytesIO(r.content)) as myzip: + logging.info(f'Reading "{file}.txt".') + myzip.extract(f"{file_path}", path="/tmp") + data = pd.read_csv(f"/tmp/{file_path}", dtype=data_types) + + os.remove(f"/tmp/{file_path}") + return data + else: + return logging.info(f'File "{file}.txt" not found.') diff --git a/gtfs_functions/gtfs_plots.py b/gtfs_functions/gtfs_plots.py index 93b8de2..e16d4d0 100644 --- a/gtfs_functions/gtfs_plots.py +++ b/gtfs_functions/gtfs_plots.py @@ -7,24 +7,25 @@ import logging import warnings + warnings.filterwarnings("ignore") def map_gdf( - gdf, - variable='min_per_trip', - colors=["#d13870", "#e895b3" , '#55d992', '#3ab071', '#0e8955', '#066a40'], - tooltip_var=['min_per_trip'], - tooltip_labels=['Headway: '], - breaks=[] - ): + gdf, + variable="min_per_trip", + colors=["#d13870", "#e895b3", "#55d992", "#3ab071", "#0e8955", "#066a40"], + tooltip_var=["min_per_trip"], + tooltip_labels=["Headway: "], + breaks=[], +): gdf.reset_index(inplace=True, drop=True) # Look for the center of the map minx, miny, maxx, maxy = gdf.geometry.total_bounds - centroid_lat = miny + (maxy - miny)/2 - centroid_lon = minx + (maxx - minx)/2 + centroid_lat = miny + (maxy - miny) / 2 + centroid_lon = minx + (maxx - minx) / 2 if isinstance(gdf[variable].values[0], str): categorical = True @@ -36,74 +37,80 @@ def map_gdf( breaks = jenkspy.jenks_breaks(gdf[variable], n_classes=len(colors)) breaks = [int(b) for b in breaks] - m = folium.Map(location=[centroid_lat, centroid_lon], - tiles='cartodbpositron', zoom_start=12 - ) + m = folium.Map(location=[centroid_lat, centroid_lon], tiles="cartodbpositron", zoom_start=12) # If the variable is categorical if categorical: - gdf['radius'] = 5 + gdf["radius"] = 5 # We start with Remix Lightrail colors # and then add default colors from Plotly qualitative_palette = [ - '#0066a1', '#a92023', '#066a40', - '#e89b01', '#613fa6', '#024b50', - '#a72051', '#a72f00', '#476800'] + "#0066a1", + "#a92023", + "#066a40", + "#e89b01", + "#613fa6", + "#024b50", + "#a72051", + "#a72f00", + "#476800", + ] color_palette = ( qualitative_palette + px.colors.qualitative.Pastel + px.colors.qualitative.Prism + px.colors.qualitative.Vivid - + px.colors.qualitative.Light24) + + px.colors.qualitative.Light24 + ) + + fill_color = pd.DataFrame( + dict( + variable=gdf[variable].unique(), + fill_color=color_palette[0 : len(gdf[variable].unique())], + ) + ) - fill_color = pd.DataFrame(dict( - variable=gdf[variable].unique(), - fill_color=color_palette[0:len(gdf[variable].unique())])) + gdf = pd.merge(gdf, fill_color, left_on=variable, right_on=variable, how="left") - gdf = pd.merge( - gdf, fill_color, - left_on=variable, right_on=variable, how='left') - # If the variable is numerical else: - gdf['radius'] = gdf[variable] / gdf[variable].max() * 10 + gdf["radius"] = gdf[variable] / gdf[variable].max() * 10 index = [int(b) for b in breaks] - colorscale = branca.colormap.StepColormap( - colors, index=index, caption=variable) - gdf['fill_color'] = gdf[variable].apply(lambda x: colorscale(x)) + colorscale = branca.colormap.StepColormap(colors, index=index, caption=variable) + gdf["fill_color"] = gdf[variable].apply(lambda x: colorscale(x)) - if gdf.geom_type.values[0] == 'Point': + if gdf.geom_type.values[0] == "Point": # my code for circles # Create the circles for i in range(int(len(gdf))): folium.CircleMarker( - location=[gdf.loc[i, 'geometry'].y, gdf.loc[i, 'geometry'].x], - radius=float(gdf.loc[i, 'radius']), - 
tooltip=tooltip_labels[0] + str(gdf.loc[i, tooltip_var[0]]) + ' min', - color='#ffffff00', + location=[gdf.loc[i, "geometry"].y, gdf.loc[i, "geometry"].x], + radius=float(gdf.loc[i, "radius"]), + tooltip=tooltip_labels[0] + str(gdf.loc[i, tooltip_var[0]]) + " min", + color="#ffffff00", fill=True, - fill_opacity=.7, - fill_color=str(gdf.loc[i, 'fill_color']) + fill_opacity=0.7, + fill_color=str(gdf.loc[i, "fill_color"]), ).add_to(m) else: # Styling function for LineStrings def style_function(feature): return { - 'fillOpacity': 0.5, - 'weight': 3, # math.log2(feature['properties']['speed'])*2, - 'color': feature['properties']['fill_color']} + "fillOpacity": 0.5, + "weight": 3, # math.log2(feature['properties']['speed'])*2, + "color": feature["properties"]["fill_color"], + } + # my code for lines geo_data = gdf.__geo_interface__ folium.GeoJson( geo_data, style_function=style_function, tooltip=folium.features.GeoJsonTooltip( - fields=tooltip_var, - aliases=tooltip_labels, - labels=True, - sticky=False) - ).add_to(m) + fields=tooltip_var, aliases=tooltip_labels, labels=True, sticky=False + ), + ).add_to(m) return m diff --git a/pyproject.toml b/pyproject.toml index 610dbdc..42e7a36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,35 +1,15 @@ [build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" +requires = ["setuptools>=43.0.0", "wheel"] +build-backend = "setuptools.build_meta" -[project] -name = "gtfs-functions" -version = "2.5" -authors = [ - { name="Santiago Toso", email="santiagoa.toso@gmail.com" }, -] -description = "Package to easily wrangle GTFS files geospatially." -readme = "README.md" -requires-python = ">=3.8" -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] +[tool.isort] +profile = "black" +line_length = 120 +multi_line_output = 3 +include_trailing_comma = true +filter_files = true +skip = ".venv,.mypy_cache" -dependencies = [ - "pandas", - "geopandas", - "shapely", "utm>=0.7.0", - #"numpy>=1.24.2", - 'numpy', - "pendulum>=3.0.0", - "branca>=0.6.0", - "plotly>=5.13.0", "jenkspy>=0.3.2", "folium>=0.14.0", - "unicode>=2.9", - "h3", "boto3" -] - -[project.urls] -"Homepage" = "https://github.com/Bondify/gtfs_functions/tree/master" -"Bug Tracker" = "https://github.com/Bondify/gtfs_functions/issues" +[tool.black] +line-length = 120 +exclude = "(\\.git|\\.mypy_cache|\\.venv|_build|buck-out|build|dist)" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..5fbfdae --- /dev/null +++ b/setup.cfg @@ -0,0 +1,21 @@ +[metadata] +version = 2.7.0 + +[flake8] +select = B,B9,C,DAR,E,F,N,RST,S,W +extend-ignore = E704,W503,E203,C901 +max-line-length = 120 +max-complexity = 10 +docstring-convention = google +per-file-ignores = tests/*:S101 +rst-roles = class,const,func,meth,mod,ref +rst-directives = deprecated +exclude = + notebooks + ./notebooks + notebooks/* + ./notebooks/* + .venv + gtfs_functions/__init__.py + data/* + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ae283ba --- /dev/null +++ b/setup.py @@ -0,0 +1,40 @@ +from setuptools import setup, find_packages + +setup( + name="gtfs-functions", + description="Package to easily wrangle GTFS files geospatially.", + project_urls={ + "Source": "https://github.com/Bondify/gtfs_functions/tree/master", + "Tracker": "https://github.com/Bondify/gtfs_functions/issues", + }, + author="Santiago Toso", + 
author_email="santiagoa.toso@gmail.com", + packages=find_packages(where="gtfs_functions"), + package_dir={"gtfs_functions": "gtfs_functions"}, + python_requires=">=3.8, <4", + install_requires=[ + # Data wrangling + "pandas", + "numpy", + "pendulum>=3.0.0", + # Geo + "geopandas", + "shapely", + "utm>=0.7.0", + "h3>3.7.7", + "haversine", + # Plotting + "branca>=0.6.0", + "plotly>=5.13.0", + "jenkspy>=0.3.2", + "folium>=0.14.0", + "unicode>=2.9", + ], + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + keywords="gtfs", +)