# display multiple outputs from single cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# required libraries
# import os, zipfile, timeit, random, statistics, urllib.request
# import rasterio, rasterio.plot, rasterio.mask, rasterio.enums
# import shapely.geometry, geopandas
The primary way to benchmark code in Python is to use the Timer class from the {timeit} library.
You can view the documentation interactively in Jupyter Notebook by prefixing the function with a question mark "?" or by using the dedicated help() function.
First, however, we need to import the library.
import timeit
?timeit.Timer
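As a minimal sketch (unrelated to the sampling example below), a Timer object takes the statement to measure as a string and is then executed a given number of times:

```python
import timeit

# the statement to benchmark is passed as a string
t = timeit.Timer("sum(range(100))")
total = t.timeit(number = 1000)  # total time of 1000 executions, in seconds
```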
The use of this class is a bit more complicated than in R.
Let's test it as before with the example of sampling numbers with replacement.
For this purpose, we can use the choices() function from the {random} library.
import random
random.choices(range(1, 100), k = 5)
[6, 13, 34, 98, 48]
Now we will use the repeat() function, which performs the benchmark multiple times.
We need to define: the statement to benchmark (as a string), the namespace it runs in (globals), the number of executions per repetition (number), and the number of repetitions (repeat).
The result will be a list with timings.
n = 1_000_000
t = timeit.repeat("random.choices(range(1, 100), k = n)",
                  globals = globals(), number = 1, repeat = 5)
t
[0.19214550000000008, 0.1433049999999998, 0.1421522000000004, 0.14099359999999983, 0.14098690000000014]
From these values, we can calculate basic statistics (like the mean and standard deviation).
Statistical functions can be found in the {statistics} library.
import statistics
round(statistics.mean(t), 2)
round(statistics.stdev(t), 4)
0.15
0.0225
For Jupyter Notebooks, there is an alternative, easier way: the timeit magic, invoked with a percent symbol:

- %timeit to benchmark a single statement
- %%timeit to benchmark an entire cell

%timeit -r 5 -n 1 random.choices(range(1, 100), k = n)
153 ms ± 13.2 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
%%timeit -r 5 -n 1
random.choices(range(1, 100), k = n)
154 ms ± 19.2 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
Exercise
Compare computing the mean value of a list using the statistics.mean() function and sum() / len().
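One way to set up the comparison (a sketch, not a full solution; the list below is made-up sample data and timings will vary by machine):

```python
import random
import statistics
import timeit

lst = [random.uniform(0, 100) for _ in range(100_000)]

# both expressions compute the same value...
assert abs(statistics.mean(lst) - sum(lst) / len(lst)) < 1e-9

# ...but may differ in speed
t_mean = timeit.repeat("statistics.mean(lst)", globals = globals(), number = 1, repeat = 5)
t_sum = timeit.repeat("sum(lst) / len(lst)", globals = globals(), number = 1, repeat = 5)
```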
For testing, we will use exactly the same raster as in the previous part. For more information, see the R notebook.
import os
import zipfile
import urllib.request
url = "https://srtm.csi.cgiar.org/wp-content/uploads/files/srtm_5x5/TIFF/srtm_39_02.zip"
if not os.path.isdir("data"):
    os.mkdir("data")
urllib.request.urlretrieve(url, "data/srtm.zip")
with zipfile.ZipFile("data/srtm.zip", "r") as zip_ref:
    zip_ref.extractall("data")  # ~70 MB tif
import rasterio
From a user perspective, Python requires more programming skills.
Consequently, you will have to write more lines of code and the workflow will eventually be more complex.
Loading data in {rasterio} consists of several steps: opening a connection to the file, reading the metadata, reading the values, and closing the connection.
When the dataset consists of many files with the same spatial extent (e.g., a satellite scene), we have to iterate over all the files.
# open connection to file
raster = rasterio.open("data/srtm_39_02.tif")
# load all metadata
metadata = raster.meta
# you can also call individual attributes
# raster.bounds
# raster.res
# raster.shape
# load values
# if there are NAs in raster, then use the `masked = True` argument
matrix = raster.read(masked = True)
# close connection
raster.close()
Let's print the metadata. For a description of the affine transformation parameters, see this blog post.
print(metadata)
{'driver': 'GTiff', 'dtype': 'int16', 'nodata': -32768.0, 'width': 6000, 'height': 6000, 'count': 1, 'crs': CRS.from_epsg(4326), 'transform': Affine(0.0008333333333333334, 0.0, 10.0, 0.0, -0.0008333333333333334, 55.0)}
Let's also print the small part of matrix. Note that indexing in Python starts at 0!
matrix[0, 0:3, 0:3] # check another extent yourself
masked_array( data=[[--, --, --], [--, --, --], [--, --, --]], mask=[[ True, True, True], [ True, True, True], [ True, True, True]], fill_value=-32768, dtype=int16)
Missing values (NA) are marked with the value -32768, and the datatype is int16.
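To see how masked arrays behave, here is a toy example (made-up values, not the SRTM data) that masks the nodata value much like raster.read(masked = True) does:

```python
import numpy as np

# mask every cell equal to the nodata value
m = np.ma.masked_equal(np.array([[-32768, 5], [7, -32768]], dtype = np.int16), -32768)
m.mean()    # masked cells are ignored in computations
m.filled()  # puts the fill value (-32768) back into the masked cells
```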
We can also check how much space this object takes up in memory using the .nbytes attribute.
# return result in MB
matrix.nbytes / 1024**2
68.66455078125
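This matches a back-of-the-envelope calculation: 6000 x 6000 cells at 2 bytes per int16 value:

```python
# rows * cols * bytes per int16, converted to MB
size_mb = 6000 * 6000 * 2 / 1024**2
size_mb  # 68.66455078125
```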
As you can see, the structure of raster data in {rasterio} looks quite different than in R.
Notice also that the stored object takes up less space (int16 values instead of R's doubles).
Nevertheless, Python also uses the float (double) type for calculations.
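A quick illustration with a toy int16 array: aggregation returns a double-precision float even though the input is integer.

```python
import numpy as np

m = np.array([[1, 2], [3, 4]], dtype = np.int16)
res = m.mean()  # 2.5 -- a float64, even though the data are int16
```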
After the data is loaded, we can visualize it.
The easiest way to do this is to use the show() function from the plot module.
By default, the matrix coordinates are in the row/column system.
To convert them to geographic coordinates, you need the affine transformation parameters previously stored under the transform key of the metadata dictionary.
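As a sketch, the six affine parameters (a, b, c, d, e, f) map a column/row index to coordinates via x = a*col + b*row + c and y = d*col + e*row + f; the values below are the ones from our metadata printed earlier:

```python
# affine parameters from the metadata of the SRTM tile
a, b, c = 0.0008333333333333334, 0.0, 10.0
d, e, f = 0.0, -0.0008333333333333334, 55.0

def rowcol_to_xy(row, col):
    """Map a row/col index to geographic x/y coordinates."""
    return (a * col + b * row + c, d * col + e * row + f)

rowcol_to_xy(0, 0)  # top-left corner: (10.0, 55.0)
```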
from rasterio.plot import show
show(matrix, transform = metadata["transform"])
<AxesSubplot:>
Now let's perform the benchmark.
%%timeit -r 5 -n 1
raster = rasterio.open("data/srtm_39_02.tif")
matrix = raster.read(masked = True)
metadata = raster.meta
raster.close()
179 ms ± 8.9 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
To crop the raster to an extent, we need to import the mask module from {rasterio} and use the {shapely} library to handle the geometry.
The extent can be defined using the box() function and four coordinates (minx, miny, maxx, maxy).
In the next step, use the mask() function, which takes as arguments a connection to the file, the geometry as a list, and the crop = True argument.
import rasterio.mask
from shapely.geometry import box
ext = box(11, 51, 14, 53) # minx, miny, maxx, maxy
raster = rasterio.open("data/srtm_39_02.tif")
crop = rasterio.mask.mask(raster, [ext], crop = True)
raster.close()
The result is a tuple consisting of a numpy matrix and an object with affine transformation parameters. Let's visualize the matrix as we did before (we have to select index 0).
show(crop[0], title = "Cropped")
<AxesSubplot:title={'center':'Cropped'}>
Finally, the benchmark:
raster = rasterio.open("data/srtm_39_02.tif")
%timeit -r 5 -n 1 rasterio.mask.mask(raster, [ext], crop = True)
raster.close()
176 ms ± 24 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
Resampling in {rasterio} can be done at the data loading stage.
We need to define the target number of rows and columns (whereas in {terra} and {stars} we defined the target resolution).
All available resampling methods can be found here, but we will use "average" as before.
After creating a new raster at a lower resolution, we need to update the pixel size in the affine transformation parameters ourselves.
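The update is simple arithmetic: the new pixel size is the old one multiplied by the downscaling factor. The sketch below assumes the 6000 x 6000 tile and the 500 x 500 target used in this section:

```python
old_res = 5 / 6000                # 5-degree tile over 6000 columns
new_res = old_res * (6000 / 500)  # scale by the column ratio
new_res  # 0.01 degrees
```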
from rasterio.enums import Resampling
# this is the resolution equivalent of 0.01 degrees in pixels
height_dest = 500
width_dest = 500
raster = rasterio.open("data/srtm_39_02.tif")
metadata = raster.meta
# resample
raster_small = raster.read(
    out_shape = (raster.count, height_dest, width_dest),
    resampling = Resampling.average,
    masked = True
)
# update resolution
new_transform = raster.transform * raster.transform.scale(
    (raster.width / raster_small.shape[-1]),
    (raster.height / raster_small.shape[-2])
)
raster.close()
Let's see the size of our raster after resampling.
raster_small.shape
(1, 500, 500)
Pixel size before and after resampling:
metadata["transform"]
new_transform
Affine(0.0008333333333333334, 0.0, 10.0, 0.0, -0.0008333333333333334, 55.0)
Affine(0.01, 0.0, 10.0, 0.0, -0.01, 55.0)
Benchmark:
%%timeit -r 5 -n 1
raster = rasterio.open("data/srtm_39_02.tif")
raster_small = raster.read(
    out_shape = (raster.count, height_dest, width_dest),
    resampling = Resampling.average,
    masked = True
)
new_transform = raster.transform * raster.transform.scale(
    (raster.width / raster_small.shape[-1]),
    (raster.height / raster_small.shape[-2])
)
raster.close()
301 ms ± 6.83 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
Prepare a task in which you check the performance of pixel classification using a height condition.
For example, all pixels above the threshold of 200 m become 1, and all pixels below become 0.
See also the section "Indexing on ndarrays" in the {numpy} documentation.
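A sketch of one possible approach, using a toy matrix in place of the SRTM data:

```python
import numpy as np

# hypothetical elevations standing in for the SRTM matrix
elev = np.array([[120, 250], [199, 301]], dtype = np.int16)

# pixels above 200 m -> 1, others -> 0
classified = np.where(elev > 200, 1, 0)
```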
import geopandas
import random
In R, we used a random seed to make the generated data the same every time.
Similarly, in Python randomness can be controlled using the random.seed() function.
However, setting the same seed in both environments will not produce identical data (though there are ways to solve this).
For this reason, we will use the data created earlier in R -- to compare exactly the same things.
Nevertheless, this is what a Python implementation could look like (it's not the only way):
%%capture --no-stdout
n = 200000
x, y = [], [] # create empty lists
random.seed(123)
for i in range(n):
    x.append(random.uniform(10, 15))
    y.append(random.uniform(50, 55))
pts = geopandas.GeoSeries.from_xy(x, y, crs = "EPSG:4326")
pts = pts.to_crs(3857)
pts.to_file("data/points2.gpkg")
The read_file() function is used to load data in {geopandas}.
After loading, we can display the information by simply calling the object.
However, to display detailed information about the CRS and extent, we have to use the crs and total_bounds attributes.
vector = geopandas.read_file("data/points.gpkg")
vector[0:5]
vector.crs
vector.total_bounds # minx, miny, maxx, maxy
|   | geometry |
|---|---|
| 0 | POINT (1273259.823 6987154.174) |
| 1 | POINT (1551963.539 6909107.966) |
| 2 | POINT (1340830.421 7329358.608) |
| 3 | POINT (1604680.147 7173921.049) |
| 4 | POINT (1636656.604 6869424.855) |
<Derived Projected CRS: EPSG:3857> Name: WGS 84 / Pseudo-Mercator Axis Info [cartesian]: - X[east]: Easting (metre) - Y[north]: Northing (metre) Area of Use: - name: World between 85.06°S and 85.06°N. - bounds: (-180.0, -85.06, 180.0, 85.06) Coordinate Operation: - name: Popular Visualisation Pseudo-Mercator - method: Popular Visualisation Pseudo Mercator Datum: World Geodetic System 1984 ensemble - Ellipsoid: WGS 84 - Prime Meridian: Greenwich
array([1113196.83718289, 6446280.00609634, 1669792.31861506, 7361860.10625515])
Let's do the benchmark.
%timeit -r 5 -n 1 geopandas.read_file("data/points.gpkg")
3.65 s ± 36.2 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
Creating buffers in {geopandas} looks very similar to {sf} and {terra}.
We also need to define the width and the number of segments (the resolution argument).
buffer = vector.buffer(50000, resolution = 5)
buffer[0:5]
0 POLYGON ((1323259.823 6987154.174, 1320812.649... 1 POLYGON ((1601963.539 6909107.966, 1599516.365... 2 POLYGON ((1390830.421 7329358.608, 1388383.247... 3 POLYGON ((1654680.147 7173921.049, 1652232.973... 4 POLYGON ((1686656.604 6869424.855, 1684209.430... dtype: geometry
Then we can visualize some of the output in a simple way (n = 100). Note that the first plot must be saved to a variable and passed as the base axes of the second.
base = buffer[0:100].plot(facecolor = "none")
vector[0:100].plot(ax = base, markersize = 5, color = "blue")
<AxesSubplot:>
Let's check the performance of this operation.
%timeit -r 5 -n 1 vector.buffer(50000, resolution = 5)
2.64 s ± 45.3 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
This test is an example where {geopandas} is more than twice as fast as its counterparts in {sf} and {terra}.
The function for calculating the distance between points works a little differently than in previous packages. If we provide the same two lists of geometries, the distance will be 0 everywhere.
vector[0:5].distance(vector[0:5])
0 0.0 1 0.0 2 0.0 3 0.0 4 0.0 dtype: float64
One solution is to write a standard loop that calculates the distance between all pairs of points (1-1, 1-2, 1-3, ..., etc.).
However, a better approach is vectorization with apply(), analogous to the *apply() family often used in R.
v = vector[0:3000] # sample data
dist = v.geometry.apply(lambda f: v.distance(f))
dist.iloc[0:5, 0:5]
|   | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| 0 | 0.000000 | 289425.243896 | 348811.783881 | 380422.523368 | 381991.377803 |
| 1 | 289425.243896 | 0.000000 | 470306.066216 | 270009.276894 | 93528.950228 |
| 2 | 348811.783881 | 470306.066216 | 0.000000 | 306231.142304 | 546856.643188 |
| 3 | 380422.523368 | 270009.276894 | 306231.142304 | 0.000000 | 306170.583493 |
| 4 | 381991.377803 | 93528.950228 | 546856.643188 | 306170.583493 | 0.000000 |
The last benchmark:
%timeit -r 5 -n 1 v.geometry.apply(lambda f: v.distance(f))
4.6 s ± 45.1 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
Write and test a function that returns raster values for a point layer. The rasterio.sample module will be helpful for this (note that a list of coordinates is expected, not geometries).
pts = geopandas.read_file("data/points.gpkg")
pts = pts.to_crs(4326)
# convert GeoDataFrame to list of coordinates
xy = [(x, y) for x, y in zip(pts["geometry"].x, pts["geometry"].y)]
raster = rasterio.open("data/srtm_39_02.tif")
pts["value"] = [i[0] for i in raster.sample(xy)]
raster.close()