# Source code for torchgeo.datasets.openbuildings
# Copyright (c) TorchGeo Contributors. All rights reserved.
# Licensed under the MIT License.
"""Open Buildings datasets."""
import glob
import os
from collections.abc import Callable, Iterable
from typing import Any, ClassVar
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import pyproj
import rasterio
import shapely
import torch
from geopandas import GeoDataFrame
from matplotlib.figure import Figure
from pyproj import CRS
from .errors import DatasetNotFoundError
from .geo import VectorDataset
from .utils import GeoSlice, Path, check_integrity
class OpenBuildings(VectorDataset):
    r"""Open Buildings dataset.

    The `Open Buildings
    <https://sites.research.google/open-buildings/>`__ dataset
    consists of computer generated building detections across the African continent.

    Dataset features:

    * 516M building detections as polygons with centroid lat/long
    * covering area of 19.4M km\ :sup:`2`\ (64% of the African continent)
    * confidence score and
      `Plus Code <https://maps.google.com/pluscodes/>`_

    Dataset format:

    * csv files containing building detections compressed as csv.gz
    * meta data geojson file

    The data can be downloaded from `here
    <https://sites.research.google/open-buildings/#open-buildings-download>`__.
    Additionally, the `meta data geometry file
    <https://openbuildings-public-dot-gweb-research.uw.r.appspot.com/public/tiles.geojson>`_
    also needs to be placed in *paths* as ``tiles.geojson``.

    If you use this dataset in your research, please cite the following technical
    report:

    * https://arxiv.org/abs/2107.12283

    .. versionadded:: 0.3
    """

    # Expected MD5 checksum for each tile file, keyed by file name.
    md5s: ClassVar[dict[str, str]] = {
        '025_buildings.csv.gz': '41db2572bfd08628d01475a2ee1a2f17',
        '04f_buildings.csv.gz': '3232c1c6d45c1543260b77e5689fc8b1',
        '05b_buildings.csv.gz': '4fc57c63bbbf9a21a3902da7adc3a670',
        '093_buildings.csv.gz': '00fce146dadf0b30255e750c4c5ac2de',
        '095_buildings.csv.gz': 'f5765b0936f7ccbd0b4abed60d994f08',
        '0c3_buildings.csv.gz': '013b130fe872387e0cff842399b423de',
        # NOTE(review): the only uncompressed entry; confirm it is intentional.
        '0c3_buildings.csv': 'a697ad2433e9a9f6001de25b4664651a',
        '0c5_buildings.csv.gz': '16ca283e9344e9da8b47acaf03c1c6e4',
        '0c7_buildings.csv.gz': 'b3774930006497a80c8a2fbf33056610',
        '0d1_buildings.csv.gz': '41e652218ca5964d297d9cd1d84b831c',
        '0d7_buildings.csv.gz': 'd365fe47d10b0756dd54ceca24598d8e',
        '0d9_buildings.csv.gz': '3ebd47fa4f86857266e9a7346d6aa163',
        '0db_buildings.csv.gz': '368213e9caa7ee229ef9403b0ca8c80d',
        '0dd_buildings.csv.gz': '8f5fcefff262fdfd82800092d2e9d841',
        '0df_buildings.csv.gz': 'cbb5f63b10daa25568bdde8d9f66f8a4',
        '0e1_buildings.csv.gz': 'a9b9bf1e541b62c8a34d2f6f2ae71e1c',
        '0e3_buildings.csv.gz': '3d9c2ffc11c02aec2bd008699f9c4bd1',
        '0e5_buildings.csv.gz': '1e1b2bf63dfc520e62e4b68db23fe64c',
        '0e7_buildings.csv.gz': 'c96797588c90e66268367cb56b4b9af8',
        '0e9_buildings.csv.gz': 'c53bb7bbc8140034d1be2c49ff49af68',
        '0eb_buildings.csv.gz': '407c771f614a15d69d78f1e25decf694',
        '0ed_buildings.csv.gz': 'bddd10992d291677019d7106ce1f4fac',
        '0ef_buildings.csv.gz': 'd1b91936e7ac06c661878ef9eb5dba7b',
        '0f1_buildings.csv.gz': '9d86eb10d2d8766e1385b6c52c11d5e2',
        '0f9_buildings.csv.gz': '1c6775131214b26f4a27b4c42d6e9fca',
        '0fb_buildings.csv.gz': 'd39528cb4e0cbff589ca89dc86d9b5db',
        '0fd_buildings.csv.gz': '304fe4a60e950c900697d975098f7536',
        '0ff_buildings.csv.gz': '266ca7ed1ad0251b3999b0e2e9b54648',
        '103_buildings.csv.gz': '8d3cafab5f1e02b2a0a6180eb34d1cac',
        '105_buildings.csv.gz': 'dd61cc74239aa9a1b30f10859122807b',
        '107_buildings.csv.gz': '823c05984f859a1bf17af8ce78bf2892',
        '109_buildings.csv.gz': 'cfdee0e807168cd1c183d9c01535369b',
        '10b_buildings.csv.gz': 'd8ecaf406abd864b641ba34985f3042e',
        '10d_buildings.csv.gz': 'af584a542a17942ff7e94653322dba87',
        '10f_buildings.csv.gz': '3d5369e15c4d1f59fb38cf61f4e6290b',
        '111_buildings.csv.gz': '47504e43d1b67101bed5d924225328dc',
        '113_buildings.csv.gz': '3f991c831569f91f34eaa8fc3882b2fd',
        '117_buildings.csv.gz': 'a4145fa6e458480e30c807f80ae5cd65',
        '119_buildings.csv.gz': '5661b7ac23f266542c7e0d962a8cae58',
        '11b_buildings.csv.gz': '41b6d036610d0bddac069ec72e68710e',
        '11d_buildings.csv.gz': '1ef75e9d176dd8d6bfa6012d36b1d25c',
        '11f_buildings.csv.gz': 'f004873d1ef3933c1716ab6409565b7d',
        '121_buildings.csv.gz': '0c7e7a9043ed069fbdefdcfcfc437482',
        '123_buildings.csv.gz': 'c46bd53b67025c3de11657220cce0aec',
        '125_buildings.csv.gz': '33253ae1a82656f4eedca9bd86f981a3',
        '127_buildings.csv.gz': '2f827f8fc93485572178e9ad0c65e22d',
        '129_buildings.csv.gz': '74f98346990a1d1e41241ce8f4bb201a',
        '12f_buildings.csv.gz': 'b1b0777296df2bfef512df0945ca3e14',
        '131_buildings.csv.gz': '8362825b10c9396ecbb85c49cd210bc6',
        '137_buildings.csv.gz': '96da7389df820405b0010db4a6c98c61',
        '139_buildings.csv.gz': 'c41e26fc6f3565c3d7c66ab977dc8159',
        '13b_buildings.csv.gz': '981d4ccb0f41a103bdad8ef949eb4ffe',
        '13d_buildings.csv.gz': 'd15585d06ee74b0095842dd887197035',
        '141_buildings.csv.gz': 'ae0bf17778d45119c74e50e06a04020d',
        '143_buildings.csv.gz': '9699809e57eb097dfaf9d484f1d9c5fa',
        '145_buildings.csv.gz': '81e74e0165ea358278ce18507dddfdb0',
        '147_buildings.csv.gz': '39edad15fa16c432f5d460f0a8166032',
        '149_buildings.csv.gz': '94bf8f8fa221744fb1d57c7d4065e69e',
        '14f_buildings.csv.gz': 'ca8410be89b5cf868c2a67861712e4ea',
        '15b_buildings.csv.gz': '8c0071c0ae20a60e8dd4d7aa6aac5a99',
        '15d_buildings.csv.gz': '35f044a323556adda5f31e8fc9307c85',
        '161_buildings.csv.gz': 'ba08b70a26f07b5e2cd4eafd9d6f826b',
        '163_buildings.csv.gz': '2bec83a2504b531cd1cb0311fcb6c952',
        '165_buildings.csv.gz': '48f934733dd3054164f9b09abee63312',
        '167_buildings.csv.gz': 'bba8657024d80d44e475759b65adc969',
        '169_buildings.csv.gz': '13e142e48597ee7a8b0b812e226dfa72',
        '16b_buildings.csv.gz': '9c62351d6cc8eaf761ab89d4586d26d6',
        '16d_buildings.csv.gz': 'a33c23da3f603c8c3eacc5e6a47aaf66',
        '16f_buildings.csv.gz': '4850dd7c8f0fb628ba5864ea9f47647b',
        '171_buildings.csv.gz': '4217f1b025db869c8bed1014704c2a79',
        '173_buildings.csv.gz': '5a5f3f07e261a9dc58c6180b69130e4a',
        '175_buildings.csv.gz': '5bbf7a7c8f57d28e024ddf8f4039b575',
        '177_buildings.csv.gz': '76cd4b17d68d62e1f088f229b65f8acf',
        '179_buildings.csv.gz': 'a5a1c6609483336ddff91b2385e70eb9',
        '17b_buildings.csv.gz': 'a47c1145a3b0bcdaba18c153b7b92b87',
        '17d_buildings.csv.gz': '3226d0abf396f44c1a436be83538dfd8',
        '17f_buildings.csv.gz': '3e18d4fc5837ee89274d30f2126b92b2',
        '181_buildings.csv.gz': 'c87639d7f6d6a85a3fa6b06910b0e145',
        '183_buildings.csv.gz': 'e94438ebf19b3b25035954d23a0e90cf',
        '185_buildings.csv.gz': '8de8d1d50c16c575f85b96dee474cb56',
        '189_buildings.csv.gz': 'da94cd495a99496fd687bbb4a1715c90',
        '18b_buildings.csv.gz': '9ab353335fe6ff694e834889be2b305d',
        '18d_buildings.csv.gz': 'e37e0f868ce96f7d14f7bf1a301da1d3',
        '18f_buildings.csv.gz': 'e9000b9ef9bb0f838088e96becfc95a1',
        '191_buildings.csv.gz': 'c00bb4d6b2b12615d576c06fe545cbfa',
        '193_buildings.csv.gz': 'd48d4c03ef053f6987b3e6e9e78a8b03',
        '195_buildings.csv.gz': 'd93ab833e74480f07a5ccf227067db5a',
        '197_buildings.csv.gz': '8667e040f9863e43924aafe6071fabc7',
        '199_buildings.csv.gz': '04ba65a4caf16cc1e0d5c4e1322c5885',
        '19b_buildings.csv.gz': 'e49412e3e1bccceb0bdb4df5201288f4',
        '19d_buildings.csv.gz': '92b5fb4e96529d90e99c788e3e8696d4',
        '19f_buildings.csv.gz': 'c023f6c37d0026b56f530b841517a6cd',
        '1a1_buildings.csv.gz': '471483b50c722af104af8a582e780c04',
        '1a3_buildings.csv.gz': '0a453053f1ff53f9e165e16c7f97354a',
        '1a5_buildings.csv.gz': '1f6a823e223d5f29c66aa728933de684',
        '1a7_buildings.csv.gz': '6130b724501fa16e6d84e484c4091f1f',
        '1a9_buildings.csv.gz': '73022e8e7b994e76a58cc763a057d542',
        '1b9_buildings.csv.gz': '48dea4af9d12b755e75b76c68c47de6b',
        '1bb_buildings.csv.gz': 'dfb9ee4d3843d81722b70f7582c775a4',
        '1bd_buildings.csv.gz': 'fdea2898fc50ae25b6196048373d8244',
        '1bf_buildings.csv.gz': '96ef27d6128d0bcdfa896fed6f27cdd0',
        '1c1_buildings.csv.gz': '32e3667d939e7f95316eb75a6ffdb603',
        '1c3_buildings.csv.gz': 'ed94b543da1bbe3101ed66f7d7727d24',
        '1c5_buildings.csv.gz': 'ce527ab33e564f0cc1b63ae467932a18',
        '1c7_buildings.csv.gz': 'd5fb474466d6a11d3b08e3a011984ada',
        '1dd_buildings.csv.gz': '9e7e50e3f95b3f2ceff6351b75ca1e75',
        '1e5_buildings.csv.gz': 'f95ea85fce47ce7edf5729086d43f922',
        '1e7_buildings.csv.gz': '2bca5682c48134e69b738d70dfe7d516',
        '1e9_buildings.csv.gz': 'f049ad06dbbb200f524b4f50d1df8c2e',
        '1eb_buildings.csv.gz': '6822d7f202b453ec3cc03fb8f04691ad',
        '1ed_buildings.csv.gz': '9dfc560e2c3d135ebdcd46fa09c47169',
        '1ef_buildings.csv.gz': '506e7772c35b09cfd3b6f8691dc2947d',
        '1f1_buildings.csv.gz': 'b74f2b585cfad3b881fe4f124080440a',
        '1f3_buildings.csv.gz': '12896642315320e11ed9ed2d3f0e5995',
        '1f5_buildings.csv.gz': '334aea21e532e178bf5c54d028158906',
        '1f7_buildings.csv.gz': '0e8c3d2e005eb04c6852a8aa993f5a76',
        '217_buildings.csv.gz': '296e9ba121fea752b865a48e5c0fe8a5',
        '219_buildings.csv.gz': '1d19b6626d738f7706f75c2935aaaff4',
        '21d_buildings.csv.gz': '28bfca1f8668f59db021d3a195994768',
        '21f_buildings.csv.gz': '06325c8b0a8f6ed598b7dc6f0bb5adf2',
        '221_buildings.csv.gz': 'a354ffc1f7226d525c7cf53848975da1',
        '223_buildings.csv.gz': '3bda1339d561b3bc749220877f1384d9',
        '225_buildings.csv.gz': '8eb02ad77919d9e551138a14d3ad1bbc',
        '227_buildings.csv.gz': 'c07aceb7c81f83a653810befa0695b61',
        '22f_buildings.csv.gz': '97d63e30e008ec4424f6b0641b75377c',
        '231_buildings.csv.gz': 'f4bc384ed74552ddcfe2e69107b91345',
        '233_buildings.csv.gz': '081756e7bdcfdc2aee9114c4cfe62bd8',
        '23b_buildings.csv.gz': '75776d3dcbc90cf3a596664747880134',
        '23d_buildings.csv.gz': 'e5d0b9b7b14601f58cfdb9ea170e9520',
        '23f_buildings.csv.gz': '77f38466419b4d391be8e4f05207fdf5',
        '3d1_buildings.csv.gz': '6659c97bd765250b0dee4b1b7ff583a9',
        '3d5_buildings.csv.gz': 'c27d8f6b2808549606f00bc04d8b42bc',
        '3d7_buildings.csv.gz': 'abdef2e68cc31c67dbb6e60c4c40483e',
        '3d9_buildings.csv.gz': '4c06ae37d8e76626345a52a32f989de9',
        '3db_buildings.csv.gz': 'e83ca0115eaf4ec0a72aaf932b00442a',
        'b5b_buildings.csv.gz': '5e5f59cb17b81137d89c4bab8107e837',
    }

    # Glob patterns for the uncompressed and the gzip-compressed tile files.
    filename_glob = '*_buildings.csv'
    zipfile_glob = '*_buildings.csv.gz'

    # Tile-footprint metadata: download location and expected local file name.
    meta_data_url = 'https://sites.research.google/open-buildings/tiles.geojson'
    meta_data_filename = 'tiles.geojson'

    # CRS assigned to the tile metadata and to the building polygons on load.
    _source_crs = CRS.from_epsg(4326)

    def __init__(
        self,
        paths: Path | Iterable[Path] = 'data',
        crs: CRS | None = None,
        res: float | tuple[float, float] = 0.0001,
        transforms: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
        checksum: bool = False,
    ) -> None:
        """Initialize a new Dataset instance.

        Args:
            paths: root directory containing the ``*_buildings.csv.gz`` tiles and
                the ``tiles.geojson`` metadata file (a single directory is required)
            crs: :term:`coordinate reference system (CRS)` to warp to
                (defaults to EPSG:4326, the CRS assigned to the data on load)
            res: resolution of the dataset in units of CRS in (xres, yres) format. If a
                single float is provided, it is used for both the x and y resolution.
            transforms: a function/transform that takes input sample and its target as
                entry and returns a transformed version
            checksum: if True, check the MD5 of the downloaded files (may be slow)

        Raises:
            DatasetNotFoundError: If no tile archive or no metadata file is found,
                or if none of the archives is listed in the metadata.

        .. versionchanged:: 0.5
           *root* was renamed to *paths*.
        """
        self.paths = paths
        if isinstance(res, int | float):
            res = (res, res)
        self.res = res
        self.checksum = checksum
        self.transforms = transforms

        self._verify()

        assert isinstance(self.paths, str | os.PathLike)
        polygon_files = glob.glob(os.path.join(self.paths, self.zipfile_glob))
        # basename() copes with mixed path separators, unlike split(os.sep),
        # and matches what _verify() does.
        polygon_filenames = [os.path.basename(f) for f in polygon_files]

        filename = os.path.join(self.paths, self.meta_data_filename)
        if not os.path.isfile(filename):
            # Without the tile footprints no spatial index can be built.
            raise DatasetNotFoundError(self)
        gdf = gpd.read_file(filename)
        gdf.set_crs(self._source_crs, inplace=True)

        # Filter to only include desired polygon files
        gdf['filepath'] = gdf['tile_url'].str.split('/').str[-1]
        gdf = gdf[gdf['filepath'].isin(polygon_filenames)]

        filepaths = [os.path.join(self.paths, filepath) for filepath in gdf['filepath']]
        if not filepaths:
            raise DatasetNotFoundError(self)

        # Convert geometries to bounding boxes (vectorised over all tiles)
        bounds = gdf.bounds
        geometries = shapely.box(
            bounds['minx'].to_numpy(),
            bounds['miny'].to_numpy(),
            bounds['maxx'].to_numpy(),
            bounds['maxy'].to_numpy(),
        )

        # Every tile gets the widest representable time interval.
        datetimes = [(pd.Timestamp.min, pd.Timestamp.max)] * len(filepaths)
        data = {'filepath': filepaths}
        index = pd.IntervalIndex.from_tuples(datetimes, closed='both', name='datetime')
        self.index = GeoDataFrame(
            data, index=index, geometry=list(geometries), crs=self._source_crs
        )

        if crs is not None and crs != self._source_crs:
            self.index.to_crs(crs, inplace=True)

    def __getitem__(self, query: GeoSlice) -> dict[str, Any]:
        """Retrieve input, target, and/or metadata indexed by spatiotemporal slice.

        Args:
            query: [xmin:xmax:xres, ymin:ymax:yres, tmin:tmax:tres] coordinates to index.

        Returns:
            Sample with keys ``mask`` (a ``(1, height, width)`` tensor), ``crs``
            and ``bounds`` (the original *query*), after applying ``transforms``
            if any were given.

        Raises:
            IndexError: If *query* is not found in the index.
        """
        # _disambiguate_slice is not defined in this class (presumably inherited).
        x, y, t = self._disambiguate_slice(query)

        # Temporal filter first, then spatial filter on the tile footprints.
        interval = pd.Interval(t.start, t.stop)
        index = self.index.iloc[self.index.index.overlaps(interval)]
        index = index.iloc[:: t.step]
        index = index.cx[x.start : x.stop, y.start : y.stop]

        if index.empty:
            raise IndexError(
                f'query: {query} not found in index with bounds: {self.bounds}'
            )

        shapes = self._filter_geometries(query, index.filepath)

        # Rasterize geometries
        width = (x.stop - x.start) / x.step
        height = (y.stop - y.start) / y.step
        out_shape = (round(height), round(width))
        transform = rasterio.transform.from_bounds(
            x.start, y.start, x.stop, y.stop, width, height
        )
        if shapes:
            masks = rasterio.features.rasterize(
                shapes, out_shape=out_shape, transform=transform
            )
            masks = torch.tensor(masks).unsqueeze(0)
        else:
            # No buildings in the query: return an all-zero mask with the same
            # dtype that rasterize produces by default, so that both branches
            # yield tensors of one dtype.
            masks = torch.zeros(size=(1, *out_shape), dtype=torch.uint8)

        sample = {'mask': masks, 'crs': self.crs, 'bounds': query}

        if self.transforms is not None:
            sample = self.transforms(sample)

        return sample

    def _filter_geometries(
        self, query: GeoSlice, filepaths: Iterable[str]
    ) -> list[Any]:
        """Collect the building polygons that intersect the query bounding box.

        Args:
            query: [xmin:xmax:xres, ymin:ymax:yres, tmin:tmax:tres] coordinates to index.
            filepaths: paths of the gzip-compressed csv tiles selected from the index

        Returns:
            List with all matching polygons from all given files, reprojected
            to the CRS of the dataset.
        """
        x, y, _ = self._disambiguate_slice(query)

        # We need to know the bounding box of the query in the source CRS.
        # transform_bounds samples points along the edges, so the result still
        # encloses the query when the reprojection bends the box; transforming
        # only two corners does not guarantee that.
        transformer = pyproj.Transformer.from_crs(
            self.crs, self._source_crs, always_xy=True
        )
        minx, miny, maxx, maxy = transformer.transform_bounds(
            x.start, y.start, x.stop, y.stop
        )

        shapes: list[Any] = []
        for f in filepaths:
            # Read in chunks to bound memory use; the context manager closes
            # the reader even if parsing a chunk fails.
            with pd.read_csv(f, chunksize=200000, compression='gzip') as csv_chunks:
                for chunk in csv_chunks:
                    chunk['geometry'] = gpd.GeoSeries.from_wkt(chunk['geometry'])
                    gdf = gpd.GeoDataFrame(
                        chunk, geometry='geometry', crs=self._source_crs
                    )
                    gdf = gdf.cx[minx:maxx, miny:maxy]
                    if gdf.empty:
                        continue
                    # Reassign rather than reproject in place: gdf is a slice.
                    gdf = gdf.to_crs(self.crs)
                    shapes.extend(gdf.geometry.tolist())

        return shapes

    def _verify(self) -> None:
        """Verify the integrity of the dataset.

        Succeeds if at least one file matching ``zipfile_glob`` exists in
        ``paths``. When ``checksum`` is enabled, every such file is also checked
        against ``md5s``; a file whose name is not listed there raises
        ``KeyError``.

        Raises:
            RuntimeError: If ``checksum`` is enabled and a file fails its MD5 check.
            DatasetNotFoundError: If no matching file is found.
        """
        # Check if the zip files have already been downloaded and checksum
        assert isinstance(self.paths, str | os.PathLike)
        pathname = os.path.join(self.paths, self.zipfile_glob)
        found = False
        for archive in glob.iglob(pathname):
            found = True
            filename = os.path.basename(archive)
            if self.checksum and not check_integrity(archive, self.md5s[filename]):
                raise RuntimeError(f'Dataset found, but corrupted: {filename}.')

        if not found:
            raise DatasetNotFoundError(self)

    def plot(
        self,
        sample: dict[str, Any],
        show_titles: bool = True,
        suptitle: str | None = None,
    ) -> Figure:
        """Plot a sample from the dataset.

        Args:
            sample: a sample returned by :meth:`__getitem__`; an optional
                ``prediction`` entry is drawn in a second panel
            show_titles: flag indicating whether to show titles above each panel
            suptitle: optional string to use as a suptitle

        Returns:
            a matplotlib Figure with the rendered sample
        """
        # Move the leading channel axis last for imshow.
        mask = sample['mask'].permute(1, 2, 0)

        showing_predictions = 'prediction' in sample
        ncols = 2 if showing_predictions else 1

        fig, axs = plt.subplots(nrows=1, ncols=ncols, figsize=(ncols * 4, 4))

        if showing_predictions:
            pred = sample['prediction'].permute(1, 2, 0)
            axs[0].imshow(mask)
            axs[0].axis('off')
            axs[1].imshow(pred)
            axs[1].axis('off')
            if show_titles:
                axs[0].set_title('Mask')
                axs[1].set_title('Prediction')
        else:
            # With a single panel, subplots returns the Axes itself.
            axs.imshow(mask)
            axs.axis('off')
            if show_titles:
                axs.set_title('Mask')

        if suptitle is not None:
            # Set the title on this figure rather than on pyplot's current one.
            fig.suptitle(suptitle)

        return fig