skmap.misc.sample_groups

sample_groups(points, *group_element_columns, spatial_resolution=None, temporal_resolution=None, date_column='date')[source]

Construct group IDs for spatial and temporal cross-validation.

Groups point samples into tiles of spatial_resolution width and height and/or intervals of temporal_resolution size. group_element_columns are also concatenated into the final group ID of each sample.

Parameters:
  • points (GeoDataFrame) – GeoDataFrame containing point samples.

  • *group_element_columns

    Names of additional columns to be concatenated into the final group IDs.

  • spatial_resolution (Union[int, float, None]) – Tile size (both x and y) used for grouping, expressed in the units of the samples' CRS.

  • temporal_resolution (Optional[timedelta]) – Interval size for grouping.

  • date_column (str) – Name of the column containing sample timestamps (as datetime objects).

Return type:

ndarray

Returns:

1D string array containing the group ID of each sample.

Examples

>>> import geopandas as gp
>>> import shapely
>>> import numpy as np
>>> from datetime import datetime, timedelta
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import cross_val_score, GroupKFold
>>>
>>> from skmap.misc import sample_groups
>>>
>>> np.random.seed(42)
>>>
>>> # construct some synthetic point data
>>> coords = np.random.random((1000, 2)) * 4000
>>> dates = datetime.now() + np.array([*map(
...         timedelta,
...         range(1000),
... )])
>>>
>>> points = gp.GeoDataFrame({
...         'geometry': shapely.points(coords),
...         'date': dates,
...         'group': np.random.choice(['a', 'b'], size=1000),
...         'predictor': np.random.random(1000),
...         'target': np.random.randint(2, size=1000),
... })
>>>
>>> # get the point groups
>>> groups = sample_groups(
...         points,
...         'group',
...         spatial_resolution=1000,
...         temporal_resolution=timedelta(days=365),
... )
>>>
>>> print(np.unique(groups))
['ax0y0t0' 'ax0y0t1' 'ax0y0t2' 'ax0y1t0' 'ax0y1t1' 'ax0y1t2' 'ax0y2t0'
 'ax0y2t1' 'ax0y2t2' 'ax0y3t0' 'ax0y3t1' 'ax0y3t2' 'ax1y0t0' 'ax1y0t1'
 'ax1y0t2' 'ax1y1t0' 'ax1y1t1' 'ax1y1t2' 'ax1y2t0' 'ax1y2t1' 'ax1y2t2'
 'ax1y3t0' 'ax1y3t1' 'ax1y3t2' 'ax2y0t0' 'ax2y0t1' 'ax2y0t2' 'ax2y1t0'
 'ax2y1t1' 'ax2y1t2' 'ax2y2t0' 'ax2y2t1' 'ax2y2t2' 'ax2y3t0' 'ax2y3t1'
 'ax2y3t2' 'ax3y0t0' 'ax3y0t1' 'ax3y0t2' 'ax3y1t0' 'ax3y1t1' 'ax3y1t2'
 'ax3y2t0' 'ax3y2t1' 'ax3y2t2' 'ax3y3t0' 'ax3y3t1' 'ax3y3t2' 'bx0y0t0'
 'bx0y0t1' 'bx0y0t2' 'bx0y1t0' 'bx0y1t1' 'bx0y1t2' 'bx0y2t0' 'bx0y2t1'
 'bx0y2t2' 'bx0y3t0' 'bx0y3t1' 'bx0y3t2' 'bx1y0t0' 'bx1y0t1' 'bx1y0t2'
 'bx1y1t0' 'bx1y1t1' 'bx1y1t2' 'bx1y2t0' 'bx1y2t1' 'bx1y2t2' 'bx1y3t0'
 'bx1y3t1' 'bx1y3t2' 'bx2y0t0' 'bx2y0t1' 'bx2y0t2' 'bx2y1t0' 'bx2y1t1'
 'bx2y1t2' 'bx2y2t0' 'bx2y2t1' 'bx2y2t2' 'bx2y3t0' 'bx2y3t1' 'bx2y3t2'
 'bx3y0t0' 'bx3y0t1' 'bx3y0t2' 'bx3y1t0' 'bx3y1t1' 'bx3y1t2' 'bx3y2t0'
 'bx3y2t1' 'bx3y2t2' 'bx3y3t0' 'bx3y3t1' 'bx3y3t2']
>>>
>>> kfold = GroupKFold(n_splits=5)
>>>
>>> # cross validate a classifier
>>> print(cross_val_score(
...         estimator=LogisticRegression(),
...         X=points.predictor.values.reshape(-1, 1),
...         y=points.target,
...         scoring='f1',
...         groups=groups, # our groups go here
... ))
[0.67549669 0.63309353 0.55084746 0.6        0.67109635]