"""Get neighbors and K-nearest neighbors.
The functions rely on the pandas index as identifiers for the geometries and their
neighbors. This makes it easy to join or aggregate the results onto the input
GeoDataFrames.
The results of all functions will be identical with GeoDataFrame and GeoSeries as input
types.
"""
import numpy as np
import shapely
from geopandas import GeoDataFrame
from geopandas import GeoSeries
from pandas import DataFrame
from pandas import MultiIndex
from pandas import Series
from sklearn.neighbors import NearestNeighbors
from .conversion import coordinate_array
from .geometry_types import get_geom_type
[docs]
def get_neighbor_indices(
gdf: GeoDataFrame | GeoSeries,
neighbors: GeoDataFrame | GeoSeries,
max_distance: int = 0,
predicate: str = "intersects",
) -> Series:
"""Creates a pandas Series with the index of 'gdf' and values of 'neighbors'.
Finds all the geometries in 'neighbors' that intersect with 'gdf' and returns a
Series where the values are the 'neighbors' indices and the index is the indices of
'gdf'. Use set_index inside the function call to get values from a column instead of
the current index.
Args:
gdf: GeoDataFrame or GeoSeries.
neighbors: GeoDataFrame or GeoSeries.
max_distance: The maximum distance between the geometries. Defaults to 0.
predicate: Spatial predicate to use in sjoin. Defaults to "intersects", meaning
the geometry itself and geometries within will be considered neighbors if
they are part of the 'neighbors' GeoDataFrame.
Returns:
A pandas Series with values of the intersecting 'neighbors' indices.
The Series' index will follow the index of 'gdf'.
Raises:
ValueError: If gdf and neighbors do not have the same coordinate reference
system.
Examples:
---------
>>> from sgis import get_neighbor_indices, to_gdf
>>> points = to_gdf([(0, 0), (0.5, 0.5)])
>>> points
geometry
0 POINT (0.00000 0.00000)
1 POINT (0.50000 0.50000)
With the default max_distance (0), the points return the index of themselves.
>>> neighbor_indices = get_neighbor_indices(points, points)
>>> neighbor_indices
0 0
1 1
Name: neighbor_index, dtype: int64
With max_distance=1, each point find themselves and the neighbor.
>>> neighbor_indices = get_neighbor_indices(points, points, max_distance=1)
>>> neighbor_indices
0 0
1 0
0 1
1 1
Name: neighbor_index, dtype: int64
Using a column instead of the index.
>>> points["text"] = [*"ab"]
>>> neighbor_indices = get_neighbor_indices(points, points.set_index("text"), max_distance=1)
>>> neighbor_indices
0 a
1 a
0 b
1 b
Name: neighbor_index, dtype: object
The returned Series will always keep the index of 'gdf' and have values of the
'neighbors' index.
>>> neighbor_indices.index
Int64Index([0, 1, 0, 1], dtype='int64')
>>> neighbor_indices.values
['a' 'a' 'b' 'b']
"""
if isinstance(gdf.index, MultiIndex) or isinstance(neighbors.index, MultiIndex):
raise ValueError("get_neighbor_indices not implemented for pandas.MultiIndex")
if gdf.crs != neighbors.crs:
raise ValueError(f"'crs' mismatch. Got {gdf.crs} and {neighbors.crs}")
if isinstance(neighbors, GeoSeries):
neighbors = neighbors.to_frame()
else:
neighbors = neighbors[[neighbors._geometry_column_name]]
# buffer and keep only geometry column
if max_distance and predicate != "nearest":
gdf = gdf.buffer(max_distance).to_frame("geometry")
else:
gdf = gdf.geometry.to_frame("geometry")
neighbors.index.name = None
gdf.index.name = None
if predicate == "nearest":
max_distance = None if max_distance == 0 else max_distance
joined = gdf.sjoin_nearest(neighbors, how="inner", max_distance=max_distance)
else:
joined = gdf.sjoin(neighbors, how="inner", predicate=predicate)
return joined.rename(columns={"index_right": "neighbor_index"})["neighbor_index"]
def get_neighbor_dfs(
df: GeoDataFrame | DataFrame,
neighbor_mapper: Series,
) -> list[GeoDataFrame | DataFrame]:
return [
df[df.index.isin(neighbor_mapper[i])] for i in neighbor_mapper.index.unique()
]
[docs]
def get_all_distances(
gdf: GeoDataFrame | GeoSeries, neighbors: GeoDataFrame | GeoSeries
) -> DataFrame:
"""Get distances from 'gdf' to all points in 'neighbors'.
Find the distance from each point in 'gdf' to each point in 'neighbors'. Also
returns the indices of 'neighbors' (as the column 'neighbor_index') and 'gdf'
(as the index).
Args:
gdf: a GeoDataFrame of points.
neighbors: a GeoDataFrame of points.
Returns:
DataFrame with the columns 'neighbor_index' and 'distance'. The index follows
the index of 'gdf'.
Raises:
ValueError: If the coordinate reference system of 'gdf' and 'neighbors' are
not the same.
Examples:
---------
>>> from sgis import get_all_distances, random_points
>>> points = random_points(100)
>>> neighbors = random_points(100)
>>> distances = get_all_distances(points, neighbors)
>>> distances
neighbor_index distance
0 70 0.050578
0 24 0.070267
0 91 0.088510
0 72 0.095352
0 40 0.103720
.. ... ...
99 27 0.713055
99 60 0.718162
99 63 0.719675
99 62 0.719747
99 90 0.761324
[10000 rows x 2 columns]
Use set_index to get values from other columns.
>>> neighbors["custom_id"] = np.random.choice([*"abcde"], len(neighbors))
>>> distances = get_all_distances(points, neighbors.set_index("custom_id"))
neighbor_index distance
0 d 0.050578
0 b 0.070267
0 e 0.088510
0 d 0.095352
0 c 0.103720
.. ... ...
99 b 0.713055
99 d 0.718162
99 d 0.719675
99 d 0.719747
99 e 0.761324
[10000 rows x 2 columns]
Since the index from 'gdf' is preserved, we can join the results with the 'points'.
>>> joined = points.join(distances)
>>> joined
geometry neighbor_index distance
0 POINT (0.59809 0.34636) d 0.050578
0 POINT (0.59809 0.34636) b 0.070267
0 POINT (0.59809 0.34636) e 0.088510
0 POINT (0.59809 0.34636) d 0.095352
0 POINT (0.59809 0.34636) c 0.103720
.. ... ... ...
99 POINT (0.35305 0.47445) b 0.713055
99 POINT (0.35305 0.47445) d 0.718162
99 POINT (0.35305 0.47445) d 0.719675
99 POINT (0.35305 0.47445) d 0.719747
99 POINT (0.35305 0.47445) e 0.761324
[10000 rows x 3 columns]
Or assign aggregated values onto the points.
>>> points["mean_distance"] = distances.groupby(level=0)["distance"].mean()
>>> points["min_distance"] = distances.groupby(level=0)["distance"].min()
>>> points
geometry mean_distance min_distance
0 POINT (0.59809 0.34636) 0.417128 0.050578
1 POINT (0.25444 0.02876) 0.673966 0.016781
2 POINT (0.22475 0.08637) 0.643514 0.030049
3 POINT (0.14814 0.23037) 0.593224 0.025758
4 POINT (0.69298 0.81931) 0.434355 0.051575
.. ... ... ...
95 POINT (0.62453 0.26793) 0.460177 0.031749
96 POINT (0.11882 0.26615) 0.592930 0.044010
97 POINT (0.03998 0.77527) 0.592031 0.090983
98 POINT (0.46047 0.79056) 0.400134 0.016012
99 POINT (0.35305 0.47445) 0.397660 0.052134
[100 rows x 3 columns]
"""
return get_k_nearest_neighbors(
gdf=gdf,
neighbors=neighbors,
k=len(neighbors),
)
[docs]
def sjoin_within_distance(
gdf: GeoDataFrame | GeoSeries,
neighbors: GeoDataFrame | GeoSeries,
distance: int | float,
distance_col: str = "distance",
**kwargs,
) -> GeoDataFrame:
"""Sjoin with a buffer on the right GeoDataFrame and adds a distance column."""
new_neighbor_cols = {"__left_range_idx": range(len(neighbors))}
if distance:
new_neighbor_cols[neighbors._geometry_column_name] = lambda x: x.buffer(
distance
)
# using assign to get a copy
neighbors = neighbors.assign(**new_neighbor_cols)
out = gdf.sjoin(neighbors, **kwargs)
out[distance_col] = shapely.distance(
out.geometry.values, neighbors.geometry.iloc[out["__left_range_idx"]].values
)
return out.drop(columns="__left_range_idx")
[docs]
def get_k_nearest_neighbors(
gdf: GeoDataFrame | GeoSeries,
neighbors: GeoDataFrame | GeoSeries,
k: int,
*,
strict: bool = False,
) -> DataFrame:
"""Finds the k nearest neighbors for a GeoDataFrame of points.
Uses the K-nearest neighbors algorithm method from scikit-learn to find the given
number of neighbors for each point in 'gdf'. Identical points are considered
neighbors. Preserves the index of 'gdf' and adds the column 'neighbor_index'
with the indices of the neighbors, as well as a column 'distance'.
Args:
gdf: a GeoDataFrame of points
neighbors: a GeoDataFrame of points
k: number of neighbors to find.
strict: If False (default), no exception is raised if k is larger than the
number of points in 'neighbors'. If True, 'k' must be less than or equal
to the number of points in 'neighbors'.
Returns:
DataFrame with the columns 'neighbor_index' and 'distance'. The index follows
the index of 'gdf'.
Raises:
ValueError: If the coordinate reference system of 'gdf' and 'neighbors' are
not the same.
Examples:
---------
Make some random points.
>>> from sgis import get_k_nearest_neighbors, random_points
>>> points = random_points(100)
>>> neighbors = random_points(100)
Get 10 nearest neighbors.
>>> distances = get_k_nearest_neighbors(points, neighbors, k=10)
>>> distances
neighbor_index distance
0 84 0.049168
0 59 0.053592
0 14 0.091812
0 40 0.118403
0 77 0.129565
.. ... ...
99 86 0.153771
99 92 0.157481
99 70 0.177368
99 65 0.184087
99 26 0.202216
[1000 rows x 2 columns]
Use set_index to use another column as identifier for the neighbors.
>>> neighbors["custom_id"] = [letter for letter in [*"abcde"] for _ in range(20)]
>>> distances = get_k_nearest_neighbors(points, neighbors.set_index("custom_id"), k=10)
>>> distances
neighbor_index distance
0 e 0.049168
0 c 0.053592
0 a 0.091812
0 c 0.118403
0 d 0.129565
.. ... ...
99 e 0.153771
99 e 0.157481
99 d 0.177368
99 d 0.184087
99 b 0.202216
[1000 rows x 2 columns]
The index from 'points' is preserved. Use join to get the distance and neighbor
index onto the 'points' GeoDataFrame.
>>> joined = points.join(distances)
>>> joined["k"] = joined.groupby(level=0)["distance"].transform("rank")
>>> joined
geometry neighbor_index distance k
0 POINT (0.89067 0.75346) e 0.049168 1.0
0 POINT (0.89067 0.75346) c 0.053592 2.0
0 POINT (0.89067 0.75346) a 0.091812 3.0
0 POINT (0.89067 0.75346) c 0.118403 4.0
0 POINT (0.89067 0.75346) d 0.129565 5.0
.. ... ... ... ...
99 POINT (0.65910 0.16714) e 0.153771 6.0
99 POINT (0.65910 0.16714) e 0.157481 7.0
99 POINT (0.65910 0.16714) d 0.177368 8.0
99 POINT (0.65910 0.16714) d 0.184087 9.0
99 POINT (0.65910 0.16714) b 0.202216 10.0
[1000 rows x 4 columns]
Or assign aggregated values directly onto the points.
>>> points["mean_distance"] = distances.groupby(level=0)["distance"].mean()
>>> points["min_distance"] = distances.groupby(level=0)["distance"].min()
>>> points
geometry mean_distance min_distance
0 POINT (0.89067 0.75346) 0.132193 0.049168
1 POINT (0.41308 0.09462) 0.116610 0.009072
2 POINT (0.13458 0.44248) 0.100539 0.059576
3 POINT (0.32670 0.44102) 0.117730 0.056133
4 POINT (0.82184 0.41231) 0.106685 0.013174
.. ... ... ...
95 POINT (0.29706 0.27520) 0.137398 0.079672
96 POINT (0.42416 0.26956) 0.160817 0.074759
97 POINT (0.98337 0.54492) 0.164551 0.070798
98 POINT (0.42458 0.77459) 0.127562 0.027662
99 POINT (0.65910 0.16714) 0.143257 0.058453
[100 rows x 3 columns]
"""
if not len(gdf) or not len(neighbors):
return DataFrame(columns=["neighbor_index", "distance"])
if gdf.crs != neighbors.crs:
raise ValueError("crs mismatch:", gdf.crs, "and", neighbors.crs)
if get_geom_type(gdf) != "point" or get_geom_type(neighbors) != "point":
raise ValueError("Geometries must be points")
# using the range index
idx_dict_gdf = {i: col for i, col in zip(range(len(gdf)), gdf.index, strict=True)}
id_dict_neighbors = {
i: col for i, col in zip(range(len(neighbors)), neighbors.index, strict=True)
}
gdf_array = coordinate_array(gdf)
neighbors_array = coordinate_array(neighbors)
dists, neighbor_indices = k_nearest_neighbors(gdf_array, neighbors_array, k, strict)
edges = _get_edges(gdf, neighbor_indices)
edges = edges[dists >= 0]
dists = dists[dists >= 0]
df = DataFrame(edges, columns=["tmp__idx__", "neighbor_index"])
df["distance"] = dists
df["tmp__idx__"] = df["tmp__idx__"].map(idx_dict_gdf)
df["neighbor_index"] = df["neighbor_index"].map(id_dict_neighbors)
df = df.set_index("tmp__idx__")
df.index.name = gdf.index.name
return df
[docs]
def k_nearest_neighbors(
from_array: np.ndarray[np.ndarray[float]],
to_array: np.ndarray[np.ndarray[float]],
k: int | None = None,
strict: bool = False,
) -> tuple[np.ndarray[float], np.ndarray[int]]:
"""Finds nearest neighbors for an array of coordinates to another array of coordinates.
Args:
from_array: Numpy array of coordinates.
to_array: Numpy array of coordinates.
k: Number of neighbors to find.
strict: If True (not default), an exception is raised
if k is larger than the length of 'to_array'.
Returns a tuple of arrays, one with distances and one with indices
of the neighbors.
"""
if not len(to_array) or not len(from_array):
return np.array([]), np.array([])
if k is None:
k = len(to_array)
if not strict:
k = k if len(to_array) >= k else len(to_array)
nbr = NearestNeighbors(n_neighbors=k, algorithm="ball_tree").fit(to_array)
distances, indices = nbr.kneighbors(from_array)
return distances, indices
def _get_edges(
gdf: GeoDataFrame | GeoSeries, indices: np.ndarray[int]
) -> np.ndarray[tuple[int]]:
"""Takes a GeoDataFrame and array of indices, and returns a 2d array of edges.
Args:
gdf (GeoDataFrame): GeoDataFrame
indices (np.ndarray[float]): a numpy array of the indices of the nearest
neighbors for each point in the GeoDataFrame.
Returns:
A 2d numpy array of edges (from-to indices).
"""
row_indices = np.arange(len(indices)).reshape(-1, 1)
return np.stack((np.broadcast_to(row_indices, indices.shape), indices), axis=-1)