Source code for torchgeo.datasets.gse
# Copyright (c) TorchGeo Contributors. All rights reserved.
# Licensed under the MIT License.
"""Google Satellite Embedding dataset."""
import pathlib
from datetime import datetime
import einops
import torch
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
from .geo import RasterDataset
from .utils import Path, Sample, disambiguate_timestamp
[docs]
class GoogleSatelliteEmbedding(RasterDataset):
    """Google Satellite Embedding dataset.

    The `Google Satellite Embedding dataset
    <https://developers.google.com/earth-engine/datasets/catalog/GOOGLE_SATELLITE_EMBEDDING_V1_ANNUAL>`__
    is a global, analysis-ready collection of learned geospatial `embeddings
    <https://developers.google.com/machine-learning/crash-course/embeddings/embedding-space>`__.
    Each 10-meter pixel in this dataset is a 64-dimensional representation,
    or "`embedding vector <https://developers.google.com/machine-learning/glossary#embedding-vector>`__",
    that encodes temporal trajectories of surface conditions at and around that pixel
    as measured by various Earth observation instruments and datasets, over a single
    calendar year.

    The dataset covers terrestrial land surfaces and shallow waters, including
    intertidal and reef zones, inland waterways, and coastal waterways.
    Coverage at the poles is limited by satellite orbits and instrument coverage.

    The embeddings are unit-length, meaning they have a magnitude of 1 and do not
    require any additional normalization, and are distributed across the unit sphere,
    making them well-suited for use with clustering algorithms and tree-based
    classifiers. The embedding space is also consistent across years, and embeddings
    from different years can be used for condition change detection by considering the
    dot product or angle between two embedding vectors. Furthermore, the embeddings
    are designed to be linearly composable, i.e., they can be aggregated to produce
    embeddings at coarser spatial resolutions or transformed with vector arithmetic,
    and still retain their semantic meaning and distance relationships.

    The Satellite Embedding dataset was produced by `AlphaEarth Foundations
    <https://deepmind.google/blog/alphaearth-foundations-helps-map-our-planet-in-unprecedented-detail/>`__,
    a geospatial embedding model that assimilates multiple datastreams including
    optical, radar, LiDAR, and other sources.

    If you use this dataset in your research, please cite the following paper:

    * https://arxiv.org/abs/2507.22291

    .. note::
       The dataset can be downloaded from a number of locations:

       * `Google Cloud Storage <https://console.cloud.google.com/storage/browser/alphaearth_foundations>`__: 2017--2024, requires a billing project
       * `Source Cooperative <https://source.coop/tge-labs/aef>`__: 2018--2024
       * `Hugging Face <https://huggingface.co/datasets/Major-TOM/Core-AlphaEarth-Embeddings>`__: subset matching Major TOM

    .. versionadded:: 0.9
    """

    # https://developers.google.com/earth-engine/datasets/catalog/GOOGLE_SATELLITE_EMBEDDING_V1_ANNUAL#bands
    # Band names 'A00' through 'A63', one per embedding dimension.
    all_bands = tuple(f'A{n:02}' for n in range(64))

    def _filepath_to_timestamp(self, filepath: Path) -> tuple[datetime, datetime]:
        """Extract minimum and maximum timestamps from the filepath.

        The path components are scanned from the file name back towards the
        root, and the first component that parses as a four-digit year wins.
        If no component parses, ``self.mint`` and ``self.maxt`` are returned.

        Args:
            filepath: Full path to the file.

        Returns:
            (mint, maxt) tuple.
        """
        # Example file paths:
        #
        # * GCS/SC: 2024/10N/x086q72fv2f9q1x4a-0000000000-0000000000.tiff
        # * HF: 2024/U/1/L/7/471U_587L.tif
        date_format = '%Y'
        for part in pathlib.Path(filepath).parts[::-1]:
            try:
                return disambiguate_timestamp(part, date_format)
            except ValueError:
                # This component is not a year; keep looking further up the path.
                continue

        return self.mint, self.maxt

    def plot(
        self, sample: Sample, show_titles: bool = True, suptitle: str | None = None
    ) -> Figure:
        """Plot a sample from the dataset.

        .. warning::
           Visualizations are generated using PCA on each image *individually*, and
           are thus not comparable across images. The plot method is provided for
           visualization purposes only and should not be used to draw conclusions.

        Args:
            sample: a sample returned by :meth:`RasterDataset.__getitem__`
            show_titles: flag indicating whether to show titles above each panel
            suptitle: optional string to use as a suptitle

        Returns:
            a matplotlib Figure with the rendered sample
        """
        _, h, w = sample['image'].shape

        # Treat every pixel as one observation: rows are pixels, columns are bands.
        pixels = einops.rearrange(sample['image'], 'c h w -> (h w) c')

        # torch.pca_lowrank only accepts floating-point input.
        if not pixels.is_floating_point():
            pixels = pixels.float()

        # Use PCA to project embeddings from 64D to 3D space
        _, _, components = torch.pca_lowrank(pixels, q=3)
        projected = pixels @ components

        # Min-max scale each of the 3 channels to [0, 1] for display. The
        # denominator is clamped so that a channel which is constant over the
        # whole image renders as 0 instead of producing NaN from 0 / 0.
        projected = projected - projected.min(dim=0, keepdim=True)[0]
        span = projected.max(dim=0, keepdim=True)[0]
        projected = projected / span.clamp(min=torch.finfo(projected.dtype).eps)

        image = einops.rearrange(projected, '(h w) c -> h w c', h=h, w=w)

        fig, ax = plt.subplots()
        ax.imshow(image)
        ax.axis('off')

        if show_titles:
            ax.set_title('Embedding')

        if suptitle is not None:
            # Attach the title to this figure explicitly rather than relying on
            # pyplot's notion of the current figure.
            fig.suptitle(suptitle)

        return fig